Patch summary (text before the first "diff --git" header is ignored by patch tools):
  * lm_eval/tasks/mmlu/generative/_mmlu.yaml: replace one aggregate_metric_list
    entry (metric: acc) with mean-aggregated exact_match.
  * lm_eval/tasks/mmlu_pro/_mmlu_pro.yaml: new group config listing the 14
    mmlu_pro_* subtasks, aggregated by mean exact_match, weighted by size.

NOTE(review): leading whitespace inside the hunks was reconstructed from the
YAML structure; confirm the context lines against the target file, or apply
with `git apply --ignore-whitespace`.
NOTE(review): the new file now ends with a newline, so the post-image blob id
on its "index" line is stale; regenerate the patch if the id matters.

diff --git a/lm_eval/tasks/mmlu/generative/_mmlu.yaml b/lm_eval/tasks/mmlu/generative/_mmlu.yaml
index 3cff0f12f5..1a63611bdb 100644
--- a/lm_eval/tasks/mmlu/generative/_mmlu.yaml
+++ b/lm_eval/tasks/mmlu/generative/_mmlu.yaml
@@ -26,7 +26,8 @@ task:
       - metric: acc
         weight_by_size: True
 aggregate_metric_list:
-  - metric: acc
+  - aggregation: mean
+    metric: exact_match
     weight_by_size: True
 metadata:
   version: 2
diff --git a/lm_eval/tasks/mmlu_pro/_mmlu_pro.yaml b/lm_eval/tasks/mmlu_pro/_mmlu_pro.yaml
new file mode 100644
index 0000000000..84ae47e76d
--- /dev/null
+++ b/lm_eval/tasks/mmlu_pro/_mmlu_pro.yaml
@@ -0,0 +1,23 @@
+group: mmlu_pro
+task:
+  - mmlu_pro_biology
+  - mmlu_pro_business
+  - mmlu_pro_chemistry
+  - mmlu_pro_computer_science
+  - mmlu_pro_economics
+  - mmlu_pro_engineering
+  - mmlu_pro_health
+  - mmlu_pro_history
+  - mmlu_pro_law
+  - mmlu_pro_math
+  - mmlu_pro_other
+  - mmlu_pro_philosophy
+  - mmlu_pro_physics
+  - mmlu_pro_psychology
+aggregate_metric_list:
+  - aggregation: mean
+    metric: exact_match
+    weight_by_size: true
+    filter_list: custom-extract
+metadata:
+  version: 1.0