feat: reconstruct evaluation framework (#33)
Kass123777 authored Aug 6, 2024
1 parent db964d1 commit e830052
Showing 244 changed files with 10,766 additions and 3,217 deletions.
47 changes: 47 additions & 0 deletions align_anything/configs/evaluation/benchmarks/arc.yaml
@@ -0,0 +1,47 @@
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

infer_cfgs:
  # The deepspeed configuration
  ds_cfgs: ds_z3_config.json
  vllm_cfgs: vllm_basic.json

default:
  # Evaluation configurations
  eval_cfgs:
    # Output directory name
    output_dir: null
    # Num shot
    n_shot: 5
    # Chain of Thought
    cot: false
  # Configuration for data
  data_cfgs:
    # Task name
    task: [ARC-Challenge, ARC-Easy]
    # Task directory
    task_dir: allenai/ai2_arc
    # Evaluation split
    split: test

  # Model configurations
  model_cfgs:
    model_id: Meta-Llama-3-8B-Instruct
    # Pretrained model name or path
    model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
    # Chat template
    chat_template: Llama3
    # The max token length
    model_max_length: 2048
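
A minimal sketch of how a benchmark config such as arc.yaml above might be read, assuming PyYAML and a hypothetical helper name load_eval_config (the framework may resolve and merge these configs differently):

# Sketch under assumed names; not the framework's real loader.
import yaml

def load_eval_config(path: str) -> dict:
    # Parse the YAML file and return its `default` block.
    with open(path, encoding='utf-8') as f:
        cfg = yaml.safe_load(f)
    return cfg['default']

cfg = load_eval_config('align_anything/configs/evaluation/benchmarks/arc.yaml')
print(cfg['eval_cfgs']['n_shot'])          # 5
print(cfg['data_cfgs']['task'])            # ['ARC-Challenge', 'ARC-Easy']
print(cfg['model_cfgs']['chat_template'])  # Llama3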
48 changes: 48 additions & 0 deletions align_anything/configs/evaluation/benchmarks/bbh.yaml
@@ -0,0 +1,48 @@
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

infer_cfgs:
  # The deepspeed configuration
  ds_cfgs: ds_z3_config.json
  vllm_cfgs: vllm_basic.json

default:
  # Evaluation configurations
  eval_cfgs:
    # Output directory name
    output_dir: null
    # Num shot
    n_shot: 3
    # Use Chain of Thought
    cot: True
  # Configuration for data
  data_cfgs:
    # Task name
    task: [boolean_expressions, dyck_languages, causal_judgement, date_understanding, disambiguation_qa, formal_fallacies, geometric_shapes, hyperbaton, logical_deduction_five_objects, logical_deduction_seven_objects, logical_deduction_three_objects, movie_recommendation, multistep_arithmetic_two, navigate, object_counting, penguins_in_a_table, reasoning_about_colored_objects, ruin_names, salient_translation_error_detection, snarks, sports_understanding, temporal_sequences, tracking_shuffled_objects_five_objects, tracking_shuffled_objects_seven_objects, tracking_shuffled_objects_three_objects, web_of_lies, word_sorting]
    # Task directory
    task_dir: lukaemon/bbh
    # Evaluation split
    split: test

  # Model configurations
  model_cfgs:
    model_id: Meta-Llama-3-8B-Instruct
    # Pretrained model name or path
    model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
    # Chat template
    chat_template: Llama3
    # The max token length
    model_max_length: 2048

49 changes: 49 additions & 0 deletions align_anything/configs/evaluation/benchmarks/belebele.yaml
@@ -0,0 +1,49 @@
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

infer_cfgs:
  # The deepspeed configuration
  ds_cfgs: ds_z3_config.json
  vllm_cfgs: vllm_basic.json

default:
  # Evaluation configurations
  eval_cfgs:
    # Output directory name
    output_dir: null
    # Num shot
    n_shot: 0
    # Use Chain of Thought
    cot: false
  # Configuration for data
  data_cfgs:
    # Task name
    task: ''
    # Task directory
    task_dir: facebook/belebele
    # Evaluation split
    split: 'eng_Latn'
    # Candidate labels
    candidate_labels: ["A", "B", "C", "D"]

  # Model configurations
  model_cfgs:
    model_id: Meta-Llama-3-8B-Instruct
    # Pretrained model name or path
    model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
    # Chat template
    chat_template: Llama3
    # The max token length
    model_max_length: 2048
49 changes: 49 additions & 0 deletions align_anything/configs/evaluation/benchmarks/cmmlu.yaml
@@ -0,0 +1,49 @@
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

infer_cfgs:
  # The deepspeed configuration
  ds_cfgs: ds_z3_config.json
  vllm_cfgs: vllm_basic.json

default:
  # Evaluation configurations
  eval_cfgs:
    # Output directory name
    output_dir: null
    # Num shot
    n_shot: 5
    # Use Chain of Thought
    cot: false
  # Configuration for data
  data_cfgs:
    # Task name
    task: [marxist_theory, college_law, global_facts, international_law, jurisprudence, world_religions, logical, professional_law, philosophy, world_history, arts]
    # Task directory
    task_dir: haonan-li/cmmlu
    # Evaluation split
    split: test
    # Candidate labels
    candidate_labels: ["A", "B", "C", "D"]

  # Model configurations
  model_cfgs:
    model_id: Meta-Llama-3-8B-Instruct
    # Pretrained model name or path
    model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
    # Chat template
    chat_template: Llama3
    # The max token length
    model_max_length: 2048
47 changes: 47 additions & 0 deletions align_anything/configs/evaluation/benchmarks/gsm8k.yaml
@@ -0,0 +1,47 @@
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

infer_cfgs:
  # The deepspeed configuration
  ds_cfgs: ds_z3_config.json
  vllm_cfgs: vllm_basic.json

default:
  # Evaluation configurations
  eval_cfgs:
    # Output directory name
    output_dir: null
    # Num shot
    n_shot: 4
    # Use Chain of Thought
    cot: true
  # Configuration for data
  data_cfgs:
    # Task name
    task: main
    # Task directory
    task_dir: openai/gsm8k
    # Evaluation split
    split: test

  # Model configurations
  model_cfgs:
    model_id: Meta-Llama-3-8B-Instruct
    # Pretrained model name or path
    model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
    # Chat template
    chat_template: Llama3
    # The max token length
    model_max_length: 2048
47 changes: 47 additions & 0 deletions align_anything/configs/evaluation/benchmarks/humaneval.yaml
@@ -0,0 +1,47 @@
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

infer_cfgs:
  # The deepspeed configuration
  ds_cfgs: ds_z3_config.json
  vllm_cfgs: vllm_basic.json

default:
  # Evaluation configurations
  eval_cfgs:
    # Output directory name
    output_dir: null
    # Num shot
    n_shot: 8
    # Use Chain of Thought
    cot: false
  # Configuration for data
  data_cfgs:
    # Task name
    task: ''
    # Task directory
    task_dir: openai/openai_humaneval
    # Evaluation split
    split: 'test'

  # Model configurations
  model_cfgs:
    model_id: Qwen2-7B-Instruct
    # Pretrained model name or path
    model_name_or_path: Qwen/Qwen2-7B-Instruct
    # Chat template
    chat_template: Qwen
    # The max token length
    model_max_length: 2048
51 changes: 51 additions & 0 deletions align_anything/configs/evaluation/benchmarks/mme.yaml
@@ -0,0 +1,51 @@
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

infer_cfgs:
  # The deepspeed configuration
  ds_cfgs: ds_z3_config.json
  vllm_cfgs: vllm_basic.json

default:
  # Evaluation configurations
  eval_cfgs:
    # Output directory name
    output_dir: null
    # Num shot
    n_shot: 0
    # Evaluation method
    action: generation
  # Configuration for data
  data_cfgs:
    # Task name
    task: default
    # Task directory
    task_dir: lmms-lab/MME
    # Evaluation split
    split: test

  # Model configurations
  model_cfgs:
    model_id: llava-1.5-7b-hf
    # Pretrained model name or path
    model_name_or_path: llava-hf/llava-1.5-7b-hf
    # Chat template
    chat_template: LLAVA
    # Whether to trust remote code
    trust_remote_code: True
    # The max token length
    max_length: 1024
    # The max new tokens for generation
    max_new_tokens: 512
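
Unlike the multiple-choice benchmarks above, mme.yaml evaluates a multimodal model by free-form generation (action: generation). A rough sketch of where its model_cfgs fields could land in a Hugging Face generation call, assuming transformers and a placeholder image path; this is illustrative, not the framework's actual inference path:

# Illustrative mapping of mme.yaml's model_cfgs onto a transformers generation call.
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

model_name = 'llava-hf/llava-1.5-7b-hf'  # model_name_or_path from the config
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
model = LlavaForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True)

image = Image.open('example.jpg')  # placeholder image, not part of the benchmark
prompt = 'USER: <image>\nIs this a photo of a cat? ASSISTANT:'
inputs = processor(text=prompt, images=image, return_tensors='pt')
outputs = model.generate(**inputs, max_new_tokens=512)  # max_new_tokens from the config
print(processor.decode(outputs[0], skip_special_tokens=True))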
47 changes: 47 additions & 0 deletions align_anything/configs/evaluation/benchmarks/mmlu-pro.yaml
@@ -0,0 +1,47 @@
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

infer_cfgs:
  # The deepspeed configuration
  ds_cfgs: ds_z3_config.json
  vllm_cfgs: vllm_basic.json

default:
  # Evaluation configurations
  eval_cfgs:
    # Output directory name
    output_dir: null
    # Num shot
    n_shot: 5
    # Use Chain of Thought
    cot: false
  # Configuration for data
  data_cfgs:
    # Task name
    task: ''
    # Task directory
    task_dir: TIGER-Lab/MMLU-Pro
    # Evaluation split
    split: test

  # Model configurations
  model_cfgs:
    model_id: Meta-Llama-3-8B-Instruct
    # Pretrained model name or path
    model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
    # Chat template
    chat_template: Llama3
    # The max token length
    model_max_length: 2048