Skip to content

Commit

Permalink
Remove HumanEval tasks from ICL eval (#715)
Browse files Browse the repository at this point in the history
* add params

* fix fsdp config

* update

* change model config

* comment out human eval

* remove boolq

* actually remove real boolq

* unroll boolq

* really actually comment out humaneval

* remove file

* lint fix

* lint fix
  • Loading branch information
tbarton16 committed Nov 6, 2023
1 parent ca8e6b5 commit be467ae
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 95 deletions.
57 changes: 26 additions & 31 deletions scripts/eval/yamls/eval_gauntlet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -133,32 +133,32 @@ eval_gauntlet:
- name: boolq
num_fewshot: 10
random_baseline: 0.5
- name: programming
benchmarks:
- name: human_eval
num_fewshot: 0
random_baseline: 0.0
- name: human_eval_cpp
num_fewshot: 0
random_baseline: 0.0
- name: human_eval_js
num_fewshot: 0
random_baseline: 0.0
- name: human_eval_return_simple
num_fewshot: 0
random_baseline: 0.0
- name: human_eval_return_complex
num_fewshot: 0
random_baseline: 0.0
- name: human_eval_25
num_fewshot: 0
random_baseline: 0.0
- name: human_eval_50
num_fewshot: 0
random_baseline: 0.0
- name: human_eval_75
num_fewshot: 0
random_baseline: 0.0
# - name: programming
# benchmarks:
# - name: human_eval
# num_fewshot: 0
# random_baseline: 0.0
# - name: human_eval_cpp
# num_fewshot: 0
# random_baseline: 0.0
# - name: human_eval_js
# num_fewshot: 0
# random_baseline: 0.0
# - name: human_eval_return_simple
# num_fewshot: 0
# random_baseline: 0.0
# - name: human_eval_return_complex
# num_fewshot: 0
# random_baseline: 0.0
# - name: human_eval_25
# num_fewshot: 0
# random_baseline: 0.0
# - name: human_eval_50
# num_fewshot: 0
# random_baseline: 0.0
# - name: human_eval_75
# num_fewshot: 0
# random_baseline: 0.0
- name: world_knowledge_lm_task_subscore
benchmarks:
- name: jeopardy
Expand Down Expand Up @@ -258,8 +258,3 @@ eval_gauntlet:
- name: squad
num_fewshot: 10
random_baseline: 0
- name: programming_lite
benchmarks:
- name: human_eval
num_fewshot: 0
random_baseline: 0.0
128 changes: 64 additions & 64 deletions scripts/eval/yamls/tasks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -173,67 +173,67 @@ icl_tasks:
num_fewshot: [10]
icl_task_type: multiple_choice
continuation_delimiter: "\nAnswer: " # this separates questions from answers
-
label: human_eval
dataset_uri: eval/local_data/programming/human_eval.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
pass_at_k: 1
num_beams: 20
batch_size: 1
icl_task_type: code_evaluation
-
label: human_eval_cpp
dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
pass_at_k: 1
num_beams: 20
batch_size: 1
icl_task_type: code_evaluation
-
label: human_eval_js
dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
pass_at_k: 1
num_beams: 20
batch_size: 1
icl_task_type: code_evaluation
-
label: human_eval_return_simple
dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
pass_at_k: 1
num_beams: 20
batch_size: 1
icl_task_type: code_evaluation
-
label: human_eval_return_complex
dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
pass_at_k: 1
num_beams: 20
batch_size: 1
icl_task_type: code_evaluation
-
label: human_eval_25
dataset_uri: eval/local_data/programming/human_eval-0.25.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
pass_at_k: 1
num_beams: 20
batch_size: 1
icl_task_type: code_evaluation
-
label: human_eval_50
dataset_uri: eval/local_data/programming/human_eval-0.5.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
pass_at_k: 1
num_beams: 20
batch_size: 1
icl_task_type: code_evaluation
-
label: human_eval_75
dataset_uri: eval/local_data/programming/human_eval-0.75.jsonl # ADD YOUR OWN DATASET URI
num_fewshot: [0]
pass_at_k: 1
num_beams: 20
batch_size: 1
icl_task_type: code_evaluation
# -
# label: human_eval
# dataset_uri: eval/local_data/programming/human_eval.jsonl # ADD YOUR OWN DATASET URI
# num_fewshot: [0]
# pass_at_k: 1
# num_beams: 20
# batch_size: 1
# icl_task_type: code_evaluation
# -
# label: human_eval_cpp
# dataset_uri: eval/local_data/programming/processed_human_eval_cpp.jsonl # ADD YOUR OWN DATASET URI
# num_fewshot: [0]
# pass_at_k: 1
# num_beams: 20
# batch_size: 1
# icl_task_type: code_evaluation
# -
# label: human_eval_js
# dataset_uri: eval/local_data/programming/processed_human_eval_js.jsonl # ADD YOUR OWN DATASET URI
# num_fewshot: [0]
# pass_at_k: 1
# num_beams: 20
# batch_size: 1
# icl_task_type: code_evaluation
# -
# label: human_eval_return_simple
# dataset_uri: eval/local_data/programming/human_eval_return_simple.jsonl # ADD YOUR OWN DATASET URI
# num_fewshot: [0]
# pass_at_k: 1
# num_beams: 20
# batch_size: 1
# icl_task_type: code_evaluation
# -
# label: human_eval_return_complex
# dataset_uri: eval/local_data/programming/human_eval_return_complex.jsonl # ADD YOUR OWN DATASET URI
# num_fewshot: [0]
# pass_at_k: 1
# num_beams: 20
# batch_size: 1
# icl_task_type: code_evaluation
# -
# label: human_eval_25
# dataset_uri: eval/local_data/programming/human_eval-0.25.jsonl # ADD YOUR OWN DATASET URI
# num_fewshot: [0]
# pass_at_k: 1
# num_beams: 20
# batch_size: 1
# icl_task_type: code_evaluation
# -
# label: human_eval_50
# dataset_uri: eval/local_data/programming/human_eval-0.5.jsonl # ADD YOUR OWN DATASET URI
# num_fewshot: [0]
# pass_at_k: 1
# num_beams: 20
# batch_size: 1
# icl_task_type: code_evaluation
# -
# label: human_eval_75
# dataset_uri: eval/local_data/programming/human_eval-0.75.jsonl # ADD YOUR OWN DATASET URI
# num_fewshot: [0]
# pass_at_k: 1
# num_beams: 20
# batch_size: 1
# icl_task_type: code_evaluation

0 comments on commit be467ae

Please sign in to comment.