diff --git a/scripts/eval/yamls/eval_gauntlet.yaml b/scripts/eval/yamls/eval_gauntlet.yaml index 1d2fa34139..791023abcf 100644 --- a/scripts/eval/yamls/eval_gauntlet.yaml +++ b/scripts/eval/yamls/eval_gauntlet.yaml @@ -133,32 +133,32 @@ eval_gauntlet: - name: boolq num_fewshot: 10 random_baseline: 0.5 - - name: programming - benchmarks: - - name: human_eval - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_cpp - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_js - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_return_simple - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_return_complex - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_25 - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_50 - num_fewshot: 0 - random_baseline: 0.0 - - name: human_eval_75 - num_fewshot: 0 - random_baseline: 0.0 + # - name: programming + # benchmarks: + # - name: human_eval + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_cpp + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_js + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_return_simple + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_return_complex + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_25 + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_50 + # num_fewshot: 0 + # random_baseline: 0.0 + # - name: human_eval_75 + # num_fewshot: 0 + # random_baseline: 0.0 - name: world_knowledge_lm_task_subscore benchmarks: - name: jeopardy @@ -258,8 +258,3 @@ eval_gauntlet: - name: squad num_fewshot: 10 random_baseline: 0 - - name: programming_lite - benchmarks: - - name: human_eval - num_fewshot: 0 - random_baseline: 0.0 diff --git a/scripts/eval/yamls/tasks.yaml b/scripts/eval/yamls/tasks.yaml index 6b66c116ea..737b08ebeb 100644 --- a/scripts/eval/yamls/tasks.yaml +++ b/scripts/eval/yamls/tasks.yaml @@ -173,67 +173,67 @@ icl_tasks: num_fewshot: [10] icl_task_type: multiple_choice continuation_delimiter: "\nAnswer: " # this separates questions from answers -- - label: human_eval - dataset_uri: eval/local_data/programming/human_eval.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_cpp - dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_js - dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_return_simple - dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_return_complex - dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_25 - dataset_uri: eval/local_data/programming/human_eval-0.25.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_50 - dataset_uri: eval/local_data/programming/human_eval-0.5.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation -- - label: human_eval_75 - dataset_uri: eval/local_data/programming/human_eval-0.75.jsonl # ADD YOUR OWN DATASET URI - num_fewshot: [0] - pass_at_k: 1 - num_beams: 20 - batch_size: 1 - icl_task_type: code_evaluation +# - +# label: human_eval +# dataset_uri: eval/local_data/programming/human_eval.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_cpp +# dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_js +# dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_return_simple +# dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_return_complex +# dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_25 +# dataset_uri: eval/local_data/programming/human_eval-0.25.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_50 +# dataset_uri: eval/local_data/programming/human_eval-0.5.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation +# - +# label: human_eval_75 +# dataset_uri: eval/local_data/programming/human_eval-0.75.jsonl # ADD YOUR OWN DATASET URI +# num_fewshot: [0] +# pass_at_k: 1 +# num_beams: 20 +# batch_size: 1 +# icl_task_type: code_evaluation