feat: reconstruct evaluation framework (#33)
Kass123777 authored Aug 6, 2024
1 parent db964d1 commit e830052
Showing 244 changed files with 10,766 additions and 3,217 deletions.
47 changes: 47 additions & 0 deletions align_anything/configs/evaluation/benchmarks/arc.yaml
@@ -0,0 +1,47 @@
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

infer_cfgs:
  # The deepspeed configuration
  ds_cfgs: ds_z3_config.json
  vllm_cfgs: vllm_basic.json

default:
  # Evaluation configurations
  eval_cfgs:
    # Output directory name
    output_dir: null
    # Num shot
    n_shot: 5
    # Chain of Thought
    cot: false
  # Configuration for data
  data_cfgs:
    # Task name
    task: [ARC-Challenge, ARC-Easy]
    # Task directory
    task_dir: allenai/ai2_arc
    # Evaluation split
    split: test

  # Model configurations
  model_cfgs:
    model_id: Meta-Llama-3-8B-Instruct
    # Pretrained model name or path
    model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
    # Chat template
    chat_template: Llama3
    # The max token length
    model_max_length: 2048
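
A minimal sketch of how a benchmark config such as arc.yaml above might be read, assuming PyYAML and a hypothetical helper name load_eval_config (the framework may resolve and merge these configs differently):

# Sketch under assumed names; not the framework's real loader.
import yaml

def load_eval_config(path: str) -> dict:
    # Parse the YAML file and return its `default` block.
    with open(path, encoding='utf-8') as f:
        cfg = yaml.safe_load(f)
    return cfg['default']

cfg = load_eval_config('align_anything/configs/evaluation/benchmarks/arc.yaml')
print(cfg['eval_cfgs']['n_shot'])          # 5
print(cfg['data_cfgs']['task'])            # ['ARC-Challenge', 'ARC-Easy']
print(cfg['model_cfgs']['chat_template'])  # Llama3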
48 changes: 48 additions & 0 deletions align_anything/configs/evaluation/benchmarks/bbh.yaml
@@ -0,0 +1,48 @@
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

infer_cfgs:
  # The deepspeed configuration
  ds_cfgs: ds_z3_config.json
  vllm_cfgs: vllm_basic.json

default:
  # Evaluation configurations
  eval_cfgs:
    # Output directory name
    output_dir: null
    # Num shot
    n_shot: 3
    # Use Chain of Thought
    cot: True
  # Configuration for data
  data_cfgs:
    # Task name
    task: [boolean_expressions, dyck_languages, causal_judgement, date_understanding, disambiguation_qa, formal_fallacies, geometric_shapes, hyperbaton, logical_deduction_five_objects, logical_deduction_seven_objects, logical_deduction_three_objects, movie_recommendation, multistep_arithmetic_two, navigate, object_counting, penguins_in_a_table, reasoning_about_colored_objects, ruin_names, salient_translation_error_detection, snarks, sports_understanding, temporal_sequences, tracking_shuffled_objects_five_objects, tracking_shuffled_objects_seven_objects, tracking_shuffled_objects_three_objects, web_of_lies, word_sorting]
    # Task directory
    task_dir: lukaemon/bbh
    # Evaluation split
    split: test

  # Model configurations
  model_cfgs:
    model_id: Meta-Llama-3-8B-Instruct
    # Pretrained model name or path
    model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
    # Chat template
    chat_template: Llama3
    # The max token length
    model_max_length: 2048

49 changes: 49 additions & 0 deletions align_anything/configs/evaluation/benchmarks/belebele.yaml
@@ -0,0 +1,49 @@
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

infer_cfgs:
  # The deepspeed configuration
  ds_cfgs: ds_z3_config.json
  vllm_cfgs: vllm_basic.json

default:
  # Evaluation configurations
  eval_cfgs:
    # Output directory name
    output_dir: null
    # Num shot
    n_shot: 0
    # Use Chain of Thought
    cot: false
  # Configuration for data
  data_cfgs:
    # Task name
    task: ''
    # Task directory
    task_dir: facebook/belebele
    # Evaluation split
    split: 'eng_Latn'
    # Candidate labels
    candidate_labels: ["A", "B", "C", "D"]

  # Model configurations
  model_cfgs:
    model_id: Meta-Llama-3-8B-Instruct
    # Pretrained model name or path
    model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
    # Chat template
    chat_template: Llama3
    # The max token length
    model_max_length: 2048
49 changes: 49 additions & 0 deletions align_anything/configs/evaluation/benchmarks/cmmlu.yaml
@@ -0,0 +1,49 @@
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

infer_cfgs:
  # The deepspeed configuration
  ds_cfgs: ds_z3_config.json
  vllm_cfgs: vllm_basic.json

default:
  # Evaluation configurations
  eval_cfgs:
    # Output directory name
    output_dir: null
    # Num shot
    n_shot: 5
    # Use Chain of Thought
    cot: false
  # Configuration for data
  data_cfgs:
    # Task name
    task: [marxist_theory, college_law, global_facts, international_law, jurisprudence, world_religions, logical, professional_law, philosophy, world_history, arts]
    # Task directory
    task_dir: haonan-li/cmmlu
    # Evaluation split
    split: test
    # Candidate labels
    candidate_labels: ["A", "B", "C", "D"]

  # Model configurations
  model_cfgs:
    model_id: Meta-Llama-3-8B-Instruct
    # Pretrained model name or path
    model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
    # Chat template
    chat_template: Llama3
    # The max token length
    model_max_length: 2048
47 changes: 47 additions & 0 deletions align_anything/configs/evaluation/benchmarks/gsm8k.yaml
@@ -0,0 +1,47 @@
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

infer_cfgs:
  # The deepspeed configuration
  ds_cfgs: ds_z3_config.json
  vllm_cfgs: vllm_basic.json

default:
  # Evaluation configurations
  eval_cfgs:
    # Output directory name
    output_dir: null
    # Num shot
    n_shot: 4
    # Use Chain of Thought
    cot: true
  # Configuration for data
  data_cfgs:
    # Task name
    task: main
    # Task directory
    task_dir: openai/gsm8k
    # Evaluation split
    split: test

  # Model configurations
  model_cfgs:
    model_id: Meta-Llama-3-8B-Instruct
    # Pretrained model name or path
    model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
    # Chat template
    chat_template: Llama3
    # The max token length
    model_max_length: 2048
47 changes: 47 additions & 0 deletions align_anything/configs/evaluation/benchmarks/humaneval.yaml
@@ -0,0 +1,47 @@
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

infer_cfgs:
  # The deepspeed configuration
  ds_cfgs: ds_z3_config.json
  vllm_cfgs: vllm_basic.json

default:
  # Evaluation configurations
  eval_cfgs:
    # Output directory name
    output_dir: null
    # Num shot
    n_shot: 8
    # Use Chain of Thought
    cot: false
  # Configuration for data
  data_cfgs:
    # Task name
    task: ''
    # Task directory
    task_dir: openai/openai_humaneval
    # Evaluation split
    split: 'test'

  # Model configurations
  model_cfgs:
    model_id: Qwen2-7B-Instruct
    # Pretrained model name or path
    model_name_or_path: Qwen/Qwen2-7B-Instruct
    # Chat template
    chat_template: Qwen
    # The max token length
    model_max_length: 2048
51 changes: 51 additions & 0 deletions align_anything/configs/evaluation/benchmarks/mme.yaml
@@ -0,0 +1,51 @@
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

infer_cfgs:
  # The deepspeed configuration
  ds_cfgs: ds_z3_config.json
  vllm_cfgs: vllm_basic.json

default:
  # Evaluation configurations
  eval_cfgs:
    # Output directory name
    output_dir: null
    # Num shot
    n_shot: 0
    # Evaluation method
    action: generation
  # Configuration for data
  data_cfgs:
    # Task name
    task: default
    # Task directory
    task_dir: lmms-lab/MME
    # Evaluation split
    split: test

  # Model configurations
  model_cfgs:
    model_id: llava-1.5-7b-hf
    # Pretrained model name or path
    model_name_or_path: llava-hf/llava-1.5-7b-hf
    # Chat template
    chat_template: LLAVA
    # Whether to trust remote code
    trust_remote_code: True
    # The max token length
    max_length: 1024
    # The max new tokens for generation
    max_new_tokens: 512
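
Unlike the multiple-choice benchmarks above, mme.yaml evaluates a multimodal model by free-form generation (action: generation). A rough sketch of where its model_cfgs fields could land in a Hugging Face generation call, assuming transformers and a placeholder image path; this is illustrative, not the framework's actual inference path:

# Illustrative mapping of mme.yaml's model_cfgs onto a transformers generation call.
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

model_name = 'llava-hf/llava-1.5-7b-hf'  # model_name_or_path from the config
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
model = LlavaForConditionalGeneration.from_pretrained(model_name, trust_remote_code=True)

image = Image.open('example.jpg')  # placeholder image, not part of the benchmark
prompt = 'USER: <image>\nIs this a photo of a cat? ASSISTANT:'
inputs = processor(text=prompt, images=image, return_tensors='pt')
outputs = model.generate(**inputs, max_new_tokens=512)  # max_new_tokens from the config
print(processor.decode(outputs[0], skip_special_tokens=True))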
47 changes: 47 additions & 0 deletions align_anything/configs/evaluation/benchmarks/mmlu-pro.yaml
@@ -0,0 +1,47 @@
# Copyright 2024 PKU-Alignment Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

infer_cfgs:
  # The deepspeed configuration
  ds_cfgs: ds_z3_config.json
  vllm_cfgs: vllm_basic.json

default:
  # Evaluation configurations
  eval_cfgs:
    # Output directory name
    output_dir: null
    # Num shot
    n_shot: 5
    # Use Chain of Thought
    cot: false
  # Configuration for data
  data_cfgs:
    # Task name
    task: ''
    # Task directory
    task_dir: TIGER-Lab/MMLU-Pro
    # Evaluation split
    split: test

  # Model configurations
  model_cfgs:
    model_id: Meta-Llama-3-8B-Instruct
    # Pretrained model name or path
    model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
    # Chat template
    chat_template: Llama3
    # The max token length
    model_max_length: 2048