Skip to content

Commit

Permalink
Add Programming to Foundry (#441)
Browse files Browse the repository at this point in the history
* add programming to the gauntlet

* fix pre-commit

* add to builders

* remove programming from gauntlet yaml before v0.1

* add language to human eval jsonl

* test multilingual

* add data

* fix typo

* revert yamls

* add beams to yaml

* final fixes

* final fixes

* upgrade composer

* change beam #s

* upgrade tensorboard

* remove c dataset

---------

Co-authored-by: bcui19 <[email protected]>
  • Loading branch information
rishab-partha and bcui19 committed Aug 24, 2023
1 parent 2f30418 commit 52a3500
Show file tree
Hide file tree
Showing 10 changed files with 546 additions and 4 deletions.
4 changes: 3 additions & 1 deletion llmfoundry/models/hf/hf_causal_lm.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@

# required for loading a python model into composer
import transformers
from composer.metrics.nlp import (InContextLearningLMAccuracy,
from composer.metrics.nlp import (InContextLearningCodeEvalAccuracy,
InContextLearningLMAccuracy,
InContextLearningLMExpectedCalibrationError,
InContextLearningMCExpectedCalibrationError,
InContextLearningMultipleChoiceAccuracy,
Expand Down Expand Up @@ -74,6 +75,7 @@ def __init__(
InContextLearningLMAccuracy(),
InContextLearningMultipleChoiceAccuracy(),
InContextLearningQAAccuracy(),
InContextLearningCodeEvalAccuracy(),
InContextLearningLMExpectedCalibrationError(),
InContextLearningMCExpectedCalibrationError()
]
Expand Down
4 changes: 3 additions & 1 deletion llmfoundry/models/mpt/modeling_mpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from composer.metrics import (InContextLearningLMAccuracy,
from composer.metrics import (InContextLearningCodeEvalAccuracy,
InContextLearningLMAccuracy,
InContextLearningLMExpectedCalibrationError,
InContextLearningMCExpectedCalibrationError,
InContextLearningMultipleChoiceAccuracy,
Expand Down Expand Up @@ -698,6 +699,7 @@ def __init__(
InContextLearningLMAccuracy(),
InContextLearningMultipleChoiceAccuracy(),
InContextLearningQAAccuracy(),
InContextLearningCodeEvalAccuracy(),
InContextLearningLMExpectedCalibrationError(),
InContextLearningMCExpectedCalibrationError(),
]
Expand Down
5 changes: 5 additions & 0 deletions llmfoundry/utils/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,8 @@ def _validate_cfg(icl_cfg: DictConfig):
]
elif icl_cfg.icl_task_type == 'question_answering':
icl_cfg.metric_names = ['InContextLearningQAAccuracy']
elif icl_cfg.icl_task_type == 'code_evaluation':
icl_cfg.metric_names = ['InContextLearningCodeEvalAccuracy']
else:
raise ValueError(
f'No metric_names defined, unable to build default metrics for icl_task_type={icl_cfg.icl_task_type}.'
Expand All @@ -189,6 +191,8 @@ def _validate_cfg(icl_cfg: DictConfig):
icl_cfg.max_seq_len = default_max_seq_len
if 'batch_size' not in icl_cfg:
icl_cfg.batch_size = default_batch_size
if 'num_beams' not in icl_cfg:
icl_cfg.num_beams = 1

for icl_cfg in icl_tasks_list:
_validate_cfg(icl_cfg)
Expand Down Expand Up @@ -218,6 +222,7 @@ def _validate_cfg(icl_cfg: DictConfig):
example_delimiter=icl_cfg.example_delimiter,
continuation_delimiter=icl_cfg.continuation_delimiter,
destination_path=destination_path,
generations_per_sample=icl_cfg.num_beams,
has_categories=icl_cfg.get('has_categories', False),
)
if hasattr(
Expand Down
164 changes: 164 additions & 0 deletions scripts/eval/local_data/programming/human_eval.jsonl

Large diffs are not rendered by default.

161 changes: 161 additions & 0 deletions scripts/eval/local_data/programming/processed_humaneval_c++.jsonl

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{"task_id": "C/1", "prompt": "/*\nGiven a positive floating point number, it can be decomposed into\nand integer part (largest integer smaller than given number) and decimals\n(leftover part always smaller than 1).\n\nReturn the decimal part of the number.\n>>> truncate_number(3.5)\n0.5\n*/\n#include<stdio.h>\n#include<math.h>\nfloat truncate_number(float number){\n", "canonical_solution": " return number-(int)(number);\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n assert (truncate_number(3.5) == 0.5); \n assert (abs(truncate_number(1.33) - 0.33) < 1e-4);\n assert (abs(truncate_number(123.456) - 0.456) < 1e-4);\n}", "entry_point": "truncate_number", "test_inputs": ["3.5", "1.33", "123.456"], "test_outputs": ["0.5", "0.33", "0.456"], "language": "c"}
{"task_id": "C/2", "prompt": "/*\nReturn a greatest common divisor of two integers a and b\n>>> greatest_common_divisor(3, 5)\n1\n>>> greatest_common_divisor(25, 15)\n5\n*/\n#include<stdio.h>\n#include<stdbool.h>\nint greatest_common_divisor(int a, int b){\n", "canonical_solution": " int out,m;\n while (true){\n if (a<b) \n {\n m=a;a=b;b=m;\n }\n a=a%b;\n if (a==0) return b;\n }\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n assert (greatest_common_divisor(3, 7) == 1);\n assert (greatest_common_divisor(10, 15) == 5);\n assert (greatest_common_divisor(49, 14) == 7);\n assert (greatest_common_divisor(144, 60) == 12);\n}\n", "entry_point": "greatest_common_divisor", "test_inputs": ["3, 7", "10, 15", "49, 14", "144, 60"], "test_outputs": ["1", "5", "7", "12"], "language": "c"}
{"task_id": "C/3", "prompt": "/*\nFor a given number n, find the largest number that divides n evenly, smaller than n\n>>> largest_divisor(15)\n5\n*/\n#include<stdio.h>\nint largest_divisor(int n){\n", "canonical_solution": " for (int i=2;i*i<=n;i++)\n if (n%i==0) return n/i;\n return 1;\n\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n assert (largest_divisor(3) == 1);\n assert (largest_divisor(7) == 1);\n assert (largest_divisor(10) == 5);\n assert (largest_divisor(100) == 50);\n assert (largest_divisor(49) == 7);\n}\n", "entry_point": "largest_divisor", "test_inputs": ["3", "7", "10", "100", "49"], "test_outputs": ["1", "1", "5", "50", "7"], "language": "c"}
{"task_id": "C/4", "prompt": "/*\nReturn true if a given number is prime, and false otherwise.\n>>> is_prime(6)\nfalse\n>>> is_prime(101)\ntrue\n>>> is_prime(11)\ntrue\n>>> is_prime(13441)\ntrue\n>>> is_prime(61)\ntrue\n>>> is_prime(4)\nfalse\n>>> is_prime(1)\nfalse\n*/\n#include<stdio.h>\n#include<stdbool.h>\nbool is_prime(long long n){\n", "canonical_solution": " if (n<2) return false;\n for (long long i=2;i*i<=n;i++)\n if (n%i==0) return false;\n return true;\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n assert (is_prime(6) == false);\n assert (is_prime(101) == true);\n assert (is_prime(11) == true);\n assert (is_prime(13441) == true);\n assert (is_prime(61) == true);\n assert (is_prime(4) == false);\n assert (is_prime(1) == false);\n assert (is_prime(5) == true);\n assert (is_prime(11) == true);\n assert (is_prime(17) == true);\n assert (is_prime(5 * 17) == false);\n assert (is_prime(11 * 7) == false);\n assert (is_prime(13441 * 19) == false);\n}\n", "entry_point": "is_prime", "test_inputs": ["6", "101", "11", "13441", "61", "4", "1", "5", "11", "17", "5 * 17", "11 * 7", "13441 * 19"], "test_outputs": ["false", "true", "true", "true", "true", "false", "false", "true", "true", "true", "false", "false", "false"], "language": "c"}
{"task_id": "C/5", "prompt": "/*\nReturn the number of times the digit 7 appears in integers less than n which are divisible by 11 or 13.\n>>> fizz_buzz(50)\n0\n>>> fizz_buzz(78)\n2\n>>> fizz_buzz(79)\n3\n*/\n#include<stdio.h>\nint fizz_buzz(int n){\n", "canonical_solution": " int count=0;\n for (int i=0;i<n;i++)\n if (i%11==0 || i%13==0)\n {\n int q=i;\n while (q>0)\n {\n if (q%10==7) count+=1;\n q=q/10;\n }\n } \n return count;\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n assert (fizz_buzz(50) == 0);\n assert (fizz_buzz(78) == 2);\n assert (fizz_buzz(79) == 3);\n assert (fizz_buzz(100) == 3);\n assert (fizz_buzz(200) == 6);\n assert (fizz_buzz(4000) == 192);\n assert (fizz_buzz(10000) == 639);\n assert (fizz_buzz(100000) == 8026);\n}\n", "entry_point": "fizz_buzz", "test_inputs": ["50", "78", "79", "100", "200", "4000", "10000", "100000"], "test_outputs": ["0", "2", "3", "3", "6", "192", "639", "8026"], "language": "c"}
{"task_id": "C/6", "prompt": "/*\nprime_fib returns n-th number that is a Fibonacci number and it's also prime.\n>>> prime_fib(1)\n2\n>>> prime_fib(2)\n3\n>>> prime_fib(3)\n5\n>>> prime_fib(4)\n13\n>>> prime_fib(5)\n89\n*/\n#include<stdio.h>\n#include<stdbool.h>\nint prime_fib(int n){\n", "canonical_solution": " int f1,f2,m;\n f1=1;f2=2;\n int count=0;\n while (count<n)\n {\n f1=f1+f2;\n m=f1;f1=f2;f2=m;\n bool isprime=true;\n for (int w=2;w*w<=f1;w++)\n if (f1%w==0)\n {\n isprime=false; break;\n }\n if (isprime) count+=1;\n if (count==n) return f1;\n }\n\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n assert (prime_fib(1) == 2);\n assert (prime_fib(2) == 3);\n assert (prime_fib(3) == 5);\n assert (prime_fib(4) == 13);\n assert (prime_fib(5) == 89);\n assert (prime_fib(6) == 233);\n assert (prime_fib(7) == 1597);\n assert (prime_fib(8) == 28657);\n assert (prime_fib(9) == 514229);\n assert (prime_fib(10) == 433494437);\n}\n", "entry_point": "prime_fib", "test_inputs": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"], "test_outputs": ["2", "3", "5", "13", "89", "233", "1597", "28657", "514229", "433494437"], "language": "c"}
{"task_id": "C/7", "prompt": "/*\nImagine a road that's a perfectly straight infinitely long line.\nn cars are driving left to right; simultaneously, a different set of n cars\nare driving right to left. The two sets of cars start out being very far from\neach other. All cars move in the same speed. Two cars are said to collide\nwhen a car that's moving left to right hits a car that's moving right to left.\nHowever, the cars are infinitely sturdy and strong; as a result, they continue moving\nin their trajectory as if they did not collide.\n\nThis function outputs the number of such collisions.\n*/\n#include<stdio.h>\nint car_race_collision(int n){\n", "canonical_solution": " return n*n;\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n assert (car_race_collision(2) == 4);\n assert (car_race_collision(3) == 9);\n assert (car_race_collision(4) == 16);\n assert (car_race_collision(8) == 64);\n assert (car_race_collision(10) == 100);\n}\n", "entry_point": "car_race_collision", "test_inputs": ["2", "3", "4", "8", "10"], "test_outputs": ["4", "9", "16", "64", "100"], "language": "c"}
{"task_id": "C/8", "prompt": "/*\nThe Fib4 number sequence is a sequence similar to the Fibbonacci sequnece that's defined as follows:\nfib4(0) -> 0\nfib4(1) -> 0\nfib4(2) -> 2\nfib4(3) -> 0\nfib4(n) -> fib4(n-1) + fib4(n-2) + fib4(n-3) + fib4(n-4).\nPlease write a function to efficiently compute the n-th element of the fib4 number sequence. Do not use recursion.\n>>> fib4(5)\n4\n>>> fib4(6)\n8\n>>> fib4(7)\n14\n*/\n#include<stdio.h>\nint fib4(int n){\n", "canonical_solution": " int f[100];\n f[0]=0;\n f[1]=0;\n f[2]=2;\n f[3]=0;\n for (int i=4;i<=n;i++)\n {\n f[i]=f[i-1]+f[i-2]+f[i-3]+f[i-4];\n }\n return f[n];\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n assert (fib4(5) == 4);\n assert (fib4(8) == 28);\n assert (fib4(10) == 104);\n assert (fib4(12) == 386);\n}\n", "entry_point": "fib4", "test_inputs": ["5", "8", "10", "12"], "test_outputs": ["4", "28", "104", "386"], "language": "c"}
{"task_id": "C/9", "prompt": "/*\nReturn 2^n modulo p (be aware of numerics).\n>>> modp(3, 5)\n3\n>>> modp(1101, 101)\n2\n>>> modp(0, 101)\n1\n>>> modp(3, 11)\n8\n>>> modp(100, 101)\n1\n*/\n#include<stdio.h>\nint modp(int n,int p){\n", "canonical_solution": " int out=1;\n for (int i=0;i<n;i++)\n out=(out*2)%p;\n return out;\n}\n", "test": "#undef NDEBUG\n#include<assert.h>\nint main(){\n assert (modp(3, 5) == 3);\n assert (modp(1101, 101) == 2);\n assert (modp(0, 101) == 1);\n assert (modp(3, 11) == 8);\n assert (modp(100, 101) == 1);\n assert (modp(30, 5) == 4);\n assert (modp(31, 5) == 3);\n}\n", "entry_point": "modp", "test_inputs": ["3, 5", "1101, 101", "0, 101", "3, 11", "100, 101", "30, 5", "31, 5"], "test_outputs": ["3", "2", "1", "8", "1", "4", "3"], "language": "c"}
Loading

0 comments on commit 52a3500

Please sign in to comment.