diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md index 6c500ee5da..09a6dd797c 100644 --- a/lm_eval/tasks/README.md +++ b/lm_eval/tasks/README.md @@ -11,6 +11,8 @@ | [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic | | [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese | | [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English | +| [arabic_leaderboard_complete](arabic_leaderboard_complete/README.md) | A full version of the tasks in the Open Arabic LLM Leaderboard, focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) | +| [arabic_leaderboard_light](arabic_leaderboard_light/README.md) | A light version of the tasks in the Open Arabic LLM Leaderboard (i.e., 10% samples of the test set in the original benchmarks), focusing on the evaluation of models that reflect the characteristics of Arabic language understanding and comprehension, culture, and heritage. Note that some of these tasks are machine-translated. | Arabic (Some MT) | | [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic | | [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English | | [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. 
| English | diff --git a/lm_eval/tasks/arabic_leaderboard_complete/README.md b/lm_eval/tasks/arabic_leaderboard_complete/README.md new file mode 100644 index 0000000000..8052abcbd6 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/README.md @@ -0,0 +1,254 @@ +# Arabic Leaderboard + + +Title: Open Arabic LLM Leaderboard + +The Open Arabic LLM Leaderboard evaluates language models on a large number of different evaluation tasks that reflect the characteristics of the Arabic language and culture. +The benchmark uses several datasets, most of them translated to Arabic, and validated by native Arabic speakers. They also used benchmarks from other papers or prepared benchmarks from scratch natively for Arabic. + +Homepage: https://huggingface.co/spaces/OALL/Open-Arabic-LLM-Leaderboard + +### Citation + +``` + +@misc{OALL, + author = {Elfilali, Ali and Alobeidli, Hamza and Fourrier, Clémentine and Boussaha, Basma El Amel and Cojocaru, Ruxandra and Habib, Nathan and Hacid, Hakim}, + title = {Open Arabic LLM Leaderboard}, + year = {2024}, + publisher = {OALL}, + howpublished = "\url{https://huggingface.co/spaces/OALL/Open-Arabic-LLM-Leaderboard}" +} + +@inproceedings{almazrouei-etal-2023-alghafa, + title = "{A}l{G}hafa Evaluation Benchmark for {A}rabic Language Models", + author = "Almazrouei, Ebtesam and + Cojocaru, Ruxandra and + Baldo, Michele and + Malartic, Quentin and + Alobeidli, Hamza and + Mazzotta, Daniele and + Penedo, Guilherme and + Campesan, Giulia and + Farooq, Mugariya and + Alhammadi, Maitha and + Launay, Julien and + Noune, Badreddine", + editor = "Sawaf, Hassan and + El-Beltagy, Samhaa and + Zaghouani, Wajdi and + Magdy, Walid and + Abdelali, Ahmed and + Tomeh, Nadi and + Abu Farha, Ibrahim and + Habash, Nizar and + Khalifa, Salam and + Keleg, Amr and + Haddad, Hatem and + Zitouni, Imed and + Mrini, Khalil and + Almatham, Rawan", + booktitle = "Proceedings of ArabicNLP 2023", + month = dec, + year = "2023", + address = "Singapore (Hybrid)", + 
publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/2023.arabicnlp-1.21", + doi = "10.18653/v1/2023.arabicnlp-1.21", + pages = "244--275", + abstract = "Recent advances in the space of Arabic large language models have opened up a wealth of potential practical applications. From optimal training strategies, large scale data acquisition and continuously increasing NLP resources, the Arabic LLM landscape has improved in a very short span of time, despite being plagued by training data scarcity and limited evaluation resources compared to English. In line with contributing towards this ever-growing field, we introduce AlGhafa, a new multiple-choice evaluation benchmark for Arabic LLMs. For showcasing purposes, we train a new suite of models, including a 14 billion parameter model, the largest monolingual Arabic decoder-only model to date. We use a collection of publicly available datasets, as well as a newly introduced HandMade dataset consisting of 8 billion tokens. 
Finally, we explore the quantitative and qualitative toxicity of several Arabic models, comparing our models to existing public Arabic LLMs.", +} +@misc{huang2023acegpt, + title={AceGPT, Localizing Large Language Models in Arabic}, + author={Huang Huang and Fei Yu and Jianqing Zhu and Xuening Sun and Hao Cheng and Dingjie Song and Zhihong Chen and Abdulmohsen Alharthi and Bang An and Ziche Liu and Zhiyi Zhang and Junying Chen and Jianquan Li and Benyou Wang and Lian Zhang and Ruoyu Sun and Xiang Wan and Haizhou Li and Jinchao Xu}, + year={2023}, + eprint={2309.12053}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +@misc{lighteval, + author = {Fourrier, Clémentine and Habib, Nathan and Wolf, Thomas and Tunstall, Lewis}, + title = {LightEval: A lightweight framework for LLM evaluation}, + year = {2023}, + version = {0.3.0}, + url = {https://github.com/huggingface/lighteval} +} +``` + +### Groups and Tasks + +* `arabic_leaderboard_alghafa`: A multiple-choice evaluation benchmark for zero- and few-shot evaluation of Arabic LLMs prepared from scratch natively for Arabic. 
+ * Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf + * You can find the list of the tasks as follows: + * `arabic_leaderboard_alghafa_mcq_exams_test_ar` + * `arabic_leaderboard_alghafa_meta_ar_dialects` + * `arabic_leaderboard_alghafa_meta_ar_msa` + * `arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task` + * `arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task` + * `arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task` + * `arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task` + * `arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task` + * `arabic_leaderboard_alghafa_multiple_choice_sentiment_task` +* `arabic_leaderboard_arabic_exams`: A question answering benchmark for high school examinations in different school subjects that requires knowledge and reasoning in different languages in multiple domains. + * Paper: https://aclanthology.org/2020.emnlp-main.438.pdf +* `arabic_leaderboard_arabic_mmlu`: A multi-task language understanding benchmark for the Arabic language, sourced from school exams across diverse educational levels in different countries with native speakers in the region. + The data comprises multiple choice questions in 40 tasks. 
+ * Paper: https://arxiv.org/pdf/2402.12840 + * You can find the list of the tasks as follows: + * `arabic_leaderboard_arabic_mmlu_abstract_algebra` + * `arabic_leaderboard_arabic_mmlu_anatomy` + * `arabic_leaderboard_arabic_mmlu_astronomy` + * `arabic_leaderboard_arabic_mmlu_business_ethics` + * `arabic_leaderboard_arabic_mmlu_clinical_knowledge` + * `arabic_leaderboard_arabic_mmlu_college_biology` + * `arabic_leaderboard_arabic_mmlu_college_chemistry` + * `arabic_leaderboard_arabic_mmlu_college_computer_science` + * `arabic_leaderboard_arabic_mmlu_college_mathematics` + * `arabic_leaderboard_arabic_mmlu_college_medicine` + * `arabic_leaderboard_arabic_mmlu_college_physics` + * `arabic_leaderboard_arabic_mmlu_computer_security` + * `arabic_leaderboard_arabic_mmlu_conceptual_physics` + * `arabic_leaderboard_arabic_mmlu_econometrics` + * `arabic_leaderboard_arabic_mmlu_electrical_engineering` + * `arabic_leaderboard_arabic_mmlu_elementary_mathematics` + * `arabic_leaderboard_arabic_mmlu_formal_logic` + * `arabic_leaderboard_arabic_mmlu_global_facts` + * `arabic_leaderboard_arabic_mmlu_high_school_biology` + * `arabic_leaderboard_arabic_mmlu_high_school_chemistry` + * `arabic_leaderboard_arabic_mmlu_high_school_computer_science` + * `arabic_leaderboard_arabic_mmlu_high_school_european_history` + * `arabic_leaderboard_arabic_mmlu_high_school_geography` + * `arabic_leaderboard_arabic_mmlu_high_school_government_and_politics` + * `arabic_leaderboard_arabic_mmlu_high_school_macroeconomics` + * `arabic_leaderboard_arabic_mmlu_high_school_mathematics` + * `arabic_leaderboard_arabic_mmlu_high_school_microeconomics` + * `arabic_leaderboard_arabic_mmlu_high_school_physics` + * `arabic_leaderboard_arabic_mmlu_high_school_psychology` + * `arabic_leaderboard_arabic_mmlu_high_school_statistics` + * `arabic_leaderboard_arabic_mmlu_high_school_us_history` + * `arabic_leaderboard_arabic_mmlu_high_school_world_history` + * `arabic_leaderboard_arabic_mmlu_human_aging` + * 
`arabic_leaderboard_arabic_mmlu_human_sexuality` + * `arabic_leaderboard_arabic_mmlu_international_law` + * `arabic_leaderboard_arabic_mmlu_jurisprudence` + * `arabic_leaderboard_arabic_mmlu_logical_fallacies` + * `arabic_leaderboard_arabic_mmlu_machine_learning` + * `arabic_leaderboard_arabic_mmlu_management` + * `arabic_leaderboard_arabic_mmlu_marketing` + * `arabic_leaderboard_arabic_mmlu_medical_genetics` + * `arabic_leaderboard_arabic_mmlu_miscellaneous` + * `arabic_leaderboard_arabic_mmlu_moral_disputes` + * `arabic_leaderboard_arabic_mmlu_moral_scenarios` + * `arabic_leaderboard_arabic_mmlu_nutrition` + * `arabic_leaderboard_arabic_mmlu_philosophy` + * `arabic_leaderboard_arabic_mmlu_prehistory` + * `arabic_leaderboard_arabic_mmlu_professional_accounting` + * `arabic_leaderboard_arabic_mmlu_professional_law` + * `arabic_leaderboard_arabic_mmlu_professional_medicine` + * `arabic_leaderboard_arabic_mmlu_professional_psychology` + * `arabic_leaderboard_arabic_mmlu_public_relations` + * `arabic_leaderboard_arabic_mmlu_security_studies` + * `arabic_leaderboard_arabic_mmlu_sociology` + * `arabic_leaderboard_arabic_mmlu_us_foreign_policy` + * `arabic_leaderboard_arabic_mmlu_virology` + * `arabic_leaderboard_arabic_mmlu_world_religions` +* `arabic_leaderboard_arabic_mt_arc_challenge`: AI2 Reasoning Challenge (ARC) is a multiple-choice question task. The dataset contains only natural, grade-school science questions, + written for human tests. The challenge set contains only questions answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. (machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark) + * Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf +* `arabic_leaderboard_arabic_mt_arc_easy`: This dataset is the same as `arabic_leaderboard_arabic_mt_arc_challenge`, except it is not from the challenge set. 
+ * Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf +* `arabic_leaderboard_arabic_mt_boolq`: A true/false questions dataset that contains the columns passage, question, and the answer (i.e., true/false). (machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark) + * Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf +* `arabic_leaderboard_arabic_mt_copa`: Choice Of Plausible Alternatives (COPA) is a multiple-choice question dataset, which involves open-domain commonsense causal reasoning. (machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark) + * Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf +* `arabic_leaderboard_arabic_mt_hellaswag`: The task is to choose the next set of sentences, based on the given candidates. The tasks involve reading comprehension and information retrieval challenges + by testing the abilities of the models on basic knowledge (i.e., from 3rd grade to 9th) and commonsense inference. (machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark) + * Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf +* `arabic_leaderboard_arabic_mt_mmlu`: A multiple-choice question answering dataset from various branches of knowledge including humanities, social sciences, hard sciences, and other areas. The examples in the English dataset are translated into Arabic using ChatGPT with a translation prompt. + * Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf +* `arabic_leaderboard_arabic_mt_openbook_qa`: A multiple-choice openbook question answering dataset that requires external knowledge and reasoning. The open book that comes with these questions is + based on elementary level science facts. 
(machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark) + * Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf +* `arabic_leaderboard_arabic_mt_piqa`: Physical Interaction Question Answering (PIQA) is a multiple-choice question answering based on physical commonsense reasoning. (machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark) + * Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf +* `arabic_leaderboard_arabic_mt_race`: A multiple-choice questions dataset to assess reading comprehension tasks based on English exams in China - designed for middle school and high school students + (machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark) + * Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf +* `arabic_leaderboard_arabic_mt_sciq`: A multiple-choice Science Question Answering task to assess understanding of scientific concepts about physics, chemistry, and biology. (machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark) + * Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf +* `arabic_leaderboard_arabic_mt_toxigen`: This benchmark consists of tasks designed to evaluate language models and classify input text as hateful or not hateful. (machine translated benchmark - part of the Alghafa Arabic translated LLM benchmark) + * Paper: https://aclanthology.org/2023.arabicnlp-1.21.pdf +* `arabic_leaderboard_acva`: Arabic-Culture-Value-Alignment (ACVA) is a yes/no question dataset, generated by GPT3.5 Turbo from Arabic topics to assess model alignment with Arabic values and cultures. 
+ * Paper: https://arxiv.org/pdf/2309.12053 + * You can find the list of the tasks as follows: + - `arabic_leaderboard_acva_Algeria` + - `arabic_leaderboard_acva_Ancient_Egypt` + - `arabic_leaderboard_acva_Arab_Empire` + - `arabic_leaderboard_acva_Arabic_Architecture` + - `arabic_leaderboard_acva_Arabic_Art` + - `arabic_leaderboard_acva_Arabic_Astronomy` + - `arabic_leaderboard_acva_Arabic_Calligraphy` + - `arabic_leaderboard_acva_Arabic_Ceremony` + - `arabic_leaderboard_acva_Arabic_Clothing` + - `arabic_leaderboard_acva_Arabic_Culture` + - `arabic_leaderboard_acva_Arabic_Food` + - `arabic_leaderboard_acva_Arabic_Funeral` + - `arabic_leaderboard_acva_Arabic_Geography` + - `arabic_leaderboard_acva_Arabic_History` + - `arabic_leaderboard_acva_Arabic_Language_Origin` + - `arabic_leaderboard_acva_Arabic_Literature` + - `arabic_leaderboard_acva_Arabic_Math` + - `arabic_leaderboard_acva_Arabic_Medicine` + - `arabic_leaderboard_acva_Arabic_Music` + - `arabic_leaderboard_acva_Arabic_Ornament` + - `arabic_leaderboard_acva_Arabic_Philosophy` + - `arabic_leaderboard_acva_Arabic_Physics_and_Chemistry` + - `arabic_leaderboard_acva_Arabic_Wedding` + - `arabic_leaderboard_acva_Bahrain` + - `arabic_leaderboard_acva_Comoros` + - `arabic_leaderboard_acva_Egypt_modern` + - `arabic_leaderboard_acva_InfluenceFromAncientEgypt` + - `arabic_leaderboard_acva_InfluenceFromByzantium` + - `arabic_leaderboard_acva_InfluenceFromChina` + - `arabic_leaderboard_acva_InfluenceFromGreece` + - `arabic_leaderboard_acva_InfluenceFromIslam` + - `arabic_leaderboard_acva_InfluenceFromPersia` + - `arabic_leaderboard_acva_InfluenceFromRome` + - `arabic_leaderboard_acva_Iraq` + - `arabic_leaderboard_acva_Islam_Education` + - `arabic_leaderboard_acva_Islam_branches_and_schools` + - `arabic_leaderboard_acva_Islamic_law_system` + - `arabic_leaderboard_acva_Jordan` + - `arabic_leaderboard_acva_Kuwait` + - `arabic_leaderboard_acva_Lebanon` + - `arabic_leaderboard_acva_Libya` + - 
`arabic_leaderboard_acva_Mauritania` + - `arabic_leaderboard_acva_Mesopotamia_civilization` + - `arabic_leaderboard_acva_Morocco` + - `arabic_leaderboard_acva_Oman` + - `arabic_leaderboard_acva_Palestine` + - `arabic_leaderboard_acva_Qatar` + - `arabic_leaderboard_acva_Saudi_Arabia` + - `arabic_leaderboard_acva_Somalia` + - `arabic_leaderboard_acva_Sudan` + - `arabic_leaderboard_acva_Syria` + - `arabic_leaderboard_acva_Tunisia` + - `arabic_leaderboard_acva_United_Arab_Emirates` + - `arabic_leaderboard_acva_Yemen` + - `arabic_leaderboard_acva_communication` + - `arabic_leaderboard_acva_computer_and_phone` + - `arabic_leaderboard_acva_daily_life` + - `arabic_leaderboard_acva_entertainment` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa.yaml new file mode 100644 index 0000000000..6f0014d812 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa.yaml @@ -0,0 +1,23 @@ +group: arabic_leaderboard_alghafa +task: + - arabic_leaderboard_alghafa_mcq_exams_test_ar + - arabic_leaderboard_alghafa_meta_ar_dialects + - arabic_leaderboard_alghafa_meta_ar_msa + - arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task + - arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task + - arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task + - arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task + - arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task + - arabic_leaderboard_alghafa_multiple_choice_sentiment_task + + + +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_mcq_exams_test_ar.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_mcq_exams_test_ar.yaml new file mode 100644 index 0000000000..e436e29574 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_mcq_exams_test_ar.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_mcq_exams_test_ar +dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native +dataset_name: mcq_exams_test_ar +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: 
"{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_meta_ar_dialects.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_meta_ar_dialects.yaml new file mode 100644 index 0000000000..f19c2ecefe --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_meta_ar_dialects.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_meta_ar_dialects +dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native +dataset_name: meta_ar_dialects +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_meta_ar_msa.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_meta_ar_msa.yaml new file mode 100644 index 0000000000..0d95ec5a06 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_meta_ar_msa.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_meta_ar_msa +dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native +dataset_name: meta_ar_msa +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: 
!function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task.yaml new file mode 100644 index 0000000000..46d2b6abcf --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task +dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native +dataset_name: multiple_choice_facts_truefalse_balanced_task +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task.yaml new file mode 100644 index 0000000000..13150c690b --- /dev/null +++ 
b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task +dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native +dataset_name: multiple_choice_grounded_statement_soqal_task +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task.yaml new file mode 100644 index 0000000000..3a17548f8e --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task +dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native +dataset_name: multiple_choice_grounded_statement_xglue_mlqa_task +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + 
higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task.yaml new file mode 100644 index 0000000000..8e34a45c7a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task +dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native +dataset_name: multiple_choice_rating_sentiment_no_neutral_task +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task.yaml new file mode 100644 index 0000000000..b31748516a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task +dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native +dataset_name: multiple_choice_rating_sentiment_task +output_type: multiple_choice 
+training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_sentiment_task.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_sentiment_task.yaml new file mode 100644 index 0000000000..191b26ba0a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/arabic_leaderboard_alghafa_multiple_choice_sentiment_task.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_multiple_choice_sentiment_task +dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Native +dataset_name: multiple_choice_sentiment_task +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/utils.py b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/utils.py new file mode 100644 index 0000000000..62f9874e63 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_alghafa/utils.py @@ -0,0 +1,23 @@ +import datasets +import numpy as np + + +def process_docs(dataset: datasets.Dataset): + def _process_doc(doc): + question 
= doc["query"] + answer_index = int(doc["label"]) + # Dynamically determining the choices by excluding '__few_shots', 'query' and 'label' + choices_keys = [ + key for key in doc.keys() if key not in ["query", "label", "__few_shots"] + ] + choices = [doc[key] for key in choices_keys] + + instruction = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n" + query = f"{instruction}السؤال: {question}\n" + for index, choice in enumerate(choices): + query += f"{index}) {choice}\n" + query += "الإجابة:" + + return {"query": query, "choices": choices, "gold": answer_index} + + return dataset.map(_process_doc) diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_exams/arabic_exams.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_exams/arabic_exams.yaml new file mode 100644 index 0000000000..edc20fe4b9 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_exams/arabic_exams.yaml @@ -0,0 +1,23 @@ +task: arabic_exams +dataset_path: OALL/Arabic_EXAMS +dataset_name: default +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_exams/arabic_leaderboard_arabic_exams.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_exams/arabic_leaderboard_arabic_exams.yaml new file mode 100644 index 0000000000..2bf77eb361 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_exams/arabic_leaderboard_arabic_exams.yaml @@ -0,0 +1,13 @@ +group: 
arabic_leaderboard_arabic_exams +task: + - arabic_exams + +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_exams/utils.py b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_exams/utils.py new file mode 100644 index 0000000000..72af1c40fe --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_exams/utils.py @@ -0,0 +1,33 @@ +import datasets +import numpy as np + + +# fmt: off +LETTER_INDICES_AR = ["أ", "ب", "ج", "د", "هـ", "و", "ز", "ح", "ط", "ي", "ك", "ل", "م", "ن", "س", "ع", "ف", "ص", "ق", "ر", "ش", "ت", "ث", "خ", "ذ", "ض", "ظ", "غ"] +# fmt: on + + +# fmt: off +LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"] +# fmt: on + + +def process_docs(dataset: datasets.Dataset): + def _process_doc(doc): + topic = doc["subject"] + question = doc["question"] + choices = [doc["A"], doc["B"], doc["C"], doc["D"]] + choices_formatted = [ + f" {LETTER_INDICES_AR[i]}) {choice}\n" for i, choice in enumerate(choices) + ] + answer = doc["answer"] + answer_index = LETTER_INDICES.index(answer) + + instruction = f"الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح حول {topic.replace('_', ' ')}. 
\n\n" + query = f"{instruction}السؤال: {question}\n" + query += "\n".join(choices_formatted) + query += "\nالإجابة:" + + return {"query": query, "choices": LETTER_INDICES_AR[:4], "gold": answer_index} + + return dataset.map(_process_doc) diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu.yaml new file mode 100644 index 0000000000..ad2751bf32 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu.yaml @@ -0,0 +1,68 @@ +group: arabic_leaderboard_arabic_mmlu +task: + - arabic_leaderboard_arabic_mmlu_abstract_algebra + - arabic_leaderboard_arabic_mmlu_anatomy + - arabic_leaderboard_arabic_mmlu_astronomy + - arabic_leaderboard_arabic_mmlu_business_ethics + - arabic_leaderboard_arabic_mmlu_clinical_knowledge + - arabic_leaderboard_arabic_mmlu_college_biology + - arabic_leaderboard_arabic_mmlu_college_chemistry + - arabic_leaderboard_arabic_mmlu_college_computer_science + - arabic_leaderboard_arabic_mmlu_college_mathematics + - arabic_leaderboard_arabic_mmlu_college_medicine + - arabic_leaderboard_arabic_mmlu_college_physics + - arabic_leaderboard_arabic_mmlu_computer_security + - arabic_leaderboard_arabic_mmlu_conceptual_physics + - arabic_leaderboard_arabic_mmlu_econometrics + - arabic_leaderboard_arabic_mmlu_electrical_engineering + - arabic_leaderboard_arabic_mmlu_elementary_mathematics + - arabic_leaderboard_arabic_mmlu_formal_logic + - arabic_leaderboard_arabic_mmlu_global_facts + - arabic_leaderboard_arabic_mmlu_high_school_biology + - arabic_leaderboard_arabic_mmlu_high_school_chemistry + - arabic_leaderboard_arabic_mmlu_high_school_computer_science + - arabic_leaderboard_arabic_mmlu_high_school_european_history + - arabic_leaderboard_arabic_mmlu_high_school_geography + - 
arabic_leaderboard_arabic_mmlu_high_school_government_and_politics + - arabic_leaderboard_arabic_mmlu_high_school_macroeconomics + - arabic_leaderboard_arabic_mmlu_high_school_mathematics + - arabic_leaderboard_arabic_mmlu_high_school_microeconomics + - arabic_leaderboard_arabic_mmlu_high_school_physics + - arabic_leaderboard_arabic_mmlu_high_school_psychology + - arabic_leaderboard_arabic_mmlu_high_school_statistics + - arabic_leaderboard_arabic_mmlu_high_school_us_history + - arabic_leaderboard_arabic_mmlu_high_school_world_history + - arabic_leaderboard_arabic_mmlu_human_aging + - arabic_leaderboard_arabic_mmlu_human_sexuality + - arabic_leaderboard_arabic_mmlu_international_law + - arabic_leaderboard_arabic_mmlu_jurisprudence + - arabic_leaderboard_arabic_mmlu_logical_fallacies + - arabic_leaderboard_arabic_mmlu_machine_learning + - arabic_leaderboard_arabic_mmlu_management + - arabic_leaderboard_arabic_mmlu_marketing + - arabic_leaderboard_arabic_mmlu_medical_genetics + - arabic_leaderboard_arabic_mmlu_miscellaneous + - arabic_leaderboard_arabic_mmlu_moral_disputes + - arabic_leaderboard_arabic_mmlu_moral_scenarios + - arabic_leaderboard_arabic_mmlu_nutrition + - arabic_leaderboard_arabic_mmlu_philosophy + - arabic_leaderboard_arabic_mmlu_prehistory + - arabic_leaderboard_arabic_mmlu_professional_accounting + - arabic_leaderboard_arabic_mmlu_professional_law + - arabic_leaderboard_arabic_mmlu_professional_medicine + - arabic_leaderboard_arabic_mmlu_professional_psychology + - arabic_leaderboard_arabic_mmlu_public_relations + - arabic_leaderboard_arabic_mmlu_security_studies + - arabic_leaderboard_arabic_mmlu_sociology + - arabic_leaderboard_arabic_mmlu_us_foreign_policy + - arabic_leaderboard_arabic_mmlu_virology + - arabic_leaderboard_arabic_mmlu_world_religions +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git 
a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_abstract_algebra.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_abstract_algebra.yaml new file mode 100644 index 0000000000..3d0946be2c --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_abstract_algebra.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_abstract_algebra +dataset_path: OALL/Arabic_MMLU +dataset_name: abstract_algebra +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_anatomy.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_anatomy.yaml new file mode 100644 index 0000000000..24af11dd2f --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_anatomy.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_anatomy +dataset_path: OALL/Arabic_MMLU +dataset_name: anatomy +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true 
+metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_astronomy.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_astronomy.yaml new file mode 100644 index 0000000000..0aa9680906 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_astronomy.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_astronomy +dataset_path: OALL/Arabic_MMLU +dataset_name: astronomy +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_business_ethics.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_business_ethics.yaml new file mode 100644 index 0000000000..18c941e422 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_business_ethics.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_business_ethics +dataset_path: OALL/Arabic_MMLU +dataset_name: business_ethics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + 
aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_clinical_knowledge.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_clinical_knowledge.yaml new file mode 100644 index 0000000000..9460403c98 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_clinical_knowledge.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_clinical_knowledge +dataset_path: OALL/Arabic_MMLU +dataset_name: clinical_knowledge +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_biology.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_biology.yaml new file mode 100644 index 0000000000..2f34d342d6 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_biology.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_college_biology +dataset_path: OALL/Arabic_MMLU +dataset_name: college_biology +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + 
- metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_chemistry.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_chemistry.yaml new file mode 100644 index 0000000000..17d63b60bb --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_chemistry.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_college_chemistry +dataset_path: OALL/Arabic_MMLU +dataset_name: college_chemistry +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_computer_science.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_computer_science.yaml new file mode 100644 index 0000000000..a3f5d3e84c --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_computer_science.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_college_computer_science +dataset_path: OALL/Arabic_MMLU +dataset_name: college_computer_science +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" 
+doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_mathematics.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_mathematics.yaml new file mode 100644 index 0000000000..0284093dd9 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_mathematics.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_college_mathematics +dataset_path: OALL/Arabic_MMLU +dataset_name: college_mathematics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_medicine.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_medicine.yaml new file mode 100644 index 0000000000..e21246e7be --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_medicine.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_college_medicine +dataset_path: OALL/Arabic_MMLU +dataset_name: college_medicine +output_type: multiple_choice +training_split: null +validation_split: 
dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_physics.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_physics.yaml new file mode 100644 index 0000000000..ab23f490f3 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_college_physics.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_college_physics +dataset_path: OALL/Arabic_MMLU +dataset_name: college_physics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_computer_security.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_computer_security.yaml new file mode 100644 index 0000000000..96624cd02f --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_computer_security.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_computer_security +dataset_path: OALL/Arabic_MMLU +dataset_name: 
computer_security +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_conceptual_physics.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_conceptual_physics.yaml new file mode 100644 index 0000000000..cd605de40a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_conceptual_physics.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_conceptual_physics +dataset_path: OALL/Arabic_MMLU +dataset_name: conceptual_physics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_econometrics.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_econometrics.yaml new file mode 100644 index 0000000000..60c9f373a3 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_econometrics.yaml @@ -0,0 +1,23 @@ +task: 
arabic_leaderboard_arabic_mmlu_econometrics +dataset_path: OALL/Arabic_MMLU +dataset_name: econometrics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_electrical_engineering.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_electrical_engineering.yaml new file mode 100644 index 0000000000..83aa42a620 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_electrical_engineering.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_electrical_engineering +dataset_path: OALL/Arabic_MMLU +dataset_name: electrical_engineering +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_elementary_mathematics.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_elementary_mathematics.yaml new file mode 100644 index 0000000000..ac06d9ec7c --- /dev/null +++ 
b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_elementary_mathematics.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_elementary_mathematics +dataset_path: OALL/Arabic_MMLU +dataset_name: elementary_mathematics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_formal_logic.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_formal_logic.yaml new file mode 100644 index 0000000000..5e1d60758b --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_formal_logic.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_formal_logic +dataset_path: OALL/Arabic_MMLU +dataset_name: formal_logic +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_global_facts.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_global_facts.yaml 
new file mode 100644 index 0000000000..074248d8fe --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_global_facts.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_global_facts +dataset_path: OALL/Arabic_MMLU +dataset_name: global_facts +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_biology.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_biology.yaml new file mode 100644 index 0000000000..09862e1ce6 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_biology.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_biology +dataset_path: OALL/Arabic_MMLU +dataset_name: high_school_biology +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_chemistry.yaml 
b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_chemistry.yaml new file mode 100644 index 0000000000..849ad63ed7 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_chemistry.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_chemistry +dataset_path: OALL/Arabic_MMLU +dataset_name: high_school_chemistry +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_computer_science.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_computer_science.yaml new file mode 100644 index 0000000000..e91bfe7fb9 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_computer_science.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_computer_science +dataset_path: OALL/Arabic_MMLU +dataset_name: high_school_computer_science +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true 
+metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_european_history.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_european_history.yaml new file mode 100644 index 0000000000..912e57bfab --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_european_history.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_european_history +dataset_path: OALL/Arabic_MMLU +dataset_name: high_school_european_history +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_geography.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_geography.yaml new file mode 100644 index 0000000000..33c41db0f1 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_geography.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_geography +dataset_path: OALL/Arabic_MMLU +dataset_name: high_school_geography +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev 
+fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_government_and_politics.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_government_and_politics.yaml new file mode 100644 index 0000000000..16689f115f --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_government_and_politics.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_government_and_politics +dataset_path: OALL/Arabic_MMLU +dataset_name: high_school_government_and_politics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_macroeconomics.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_macroeconomics.yaml new file mode 100644 index 0000000000..04ec5d7431 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_macroeconomics.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_macroeconomics +dataset_path: OALL/Arabic_MMLU +dataset_name: high_school_macroeconomics +output_type: 
multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_mathematics.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_mathematics.yaml new file mode 100644 index 0000000000..fd4ebd5161 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_mathematics.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_mathematics +dataset_path: OALL/Arabic_MMLU +dataset_name: high_school_mathematics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_microeconomics.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_microeconomics.yaml new file mode 100644 index 0000000000..7ba3eea694 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_microeconomics.yaml @@ -0,0 
+1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_microeconomics +dataset_path: OALL/Arabic_MMLU +dataset_name: high_school_microeconomics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_physics.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_physics.yaml new file mode 100644 index 0000000000..8d53cca80e --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_physics.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_physics +dataset_path: OALL/Arabic_MMLU +dataset_name: high_school_physics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_psychology.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_psychology.yaml new file mode 100644 index 0000000000..129733d1dd --- /dev/null +++ 
b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_psychology.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_psychology +dataset_path: OALL/Arabic_MMLU +dataset_name: high_school_psychology +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_statistics.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_statistics.yaml new file mode 100644 index 0000000000..b23e1a77e5 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_statistics.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_statistics +dataset_path: OALL/Arabic_MMLU +dataset_name: high_school_statistics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_us_history.yaml 
b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_us_history.yaml new file mode 100644 index 0000000000..cc6ec9a397 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_us_history.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_us_history +dataset_path: OALL/Arabic_MMLU +dataset_name: high_school_us_history +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_world_history.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_world_history.yaml new file mode 100644 index 0000000000..b537669fec --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_high_school_world_history.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_world_history +dataset_path: OALL/Arabic_MMLU +dataset_name: high_school_world_history +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + 
version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_human_aging.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_human_aging.yaml new file mode 100644 index 0000000000..62124769b1 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_human_aging.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_human_aging +dataset_path: OALL/Arabic_MMLU +dataset_name: human_aging +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_human_sexuality.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_human_sexuality.yaml new file mode 100644 index 0000000000..bf6c298b8a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_human_sexuality.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_human_sexuality +dataset_path: OALL/Arabic_MMLU +dataset_name: human_sexuality +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + 
aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_international_law.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_international_law.yaml new file mode 100644 index 0000000000..feec16f59b --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_international_law.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_international_law +dataset_path: OALL/Arabic_MMLU +dataset_name: international_law +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_jurisprudence.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_jurisprudence.yaml new file mode 100644 index 0000000000..fcc1a3ab9c --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_jurisprudence.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_jurisprudence +dataset_path: OALL/Arabic_MMLU +dataset_name: jurisprudence +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + 
aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_logical_fallacies.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_logical_fallacies.yaml new file mode 100644 index 0000000000..c6de637bae --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_logical_fallacies.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_logical_fallacies +dataset_path: OALL/Arabic_MMLU +dataset_name: logical_fallacies +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_machine_learning.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_machine_learning.yaml new file mode 100644 index 0000000000..bf191fc7c8 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_machine_learning.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_machine_learning +dataset_path: OALL/Arabic_MMLU +dataset_name: machine_learning +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" 
+fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_management.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_management.yaml new file mode 100644 index 0000000000..4bbc800cfe --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_management.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_management +dataset_path: OALL/Arabic_MMLU +dataset_name: management +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_marketing.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_marketing.yaml new file mode 100644 index 0000000000..59694487eb --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_marketing.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_marketing +dataset_path: OALL/Arabic_MMLU +dataset_name: marketing +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" 
+doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_medical_genetics.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_medical_genetics.yaml new file mode 100644 index 0000000000..88f0de37c3 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_medical_genetics.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_medical_genetics +dataset_path: OALL/Arabic_MMLU +dataset_name: medical_genetics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_miscellaneous.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_miscellaneous.yaml new file mode 100644 index 0000000000..da333e4536 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_miscellaneous.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_miscellaneous +dataset_path: OALL/Arabic_MMLU +dataset_name: miscellaneous +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function 
utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_moral_disputes.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_moral_disputes.yaml new file mode 100644 index 0000000000..1d0d07945f --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_moral_disputes.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_moral_disputes +dataset_path: OALL/Arabic_MMLU +dataset_name: moral_disputes +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_moral_scenarios.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_moral_scenarios.yaml new file mode 100644 index 0000000000..c0c924650f --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_moral_scenarios.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_moral_scenarios +dataset_path: OALL/Arabic_MMLU +dataset_name: moral_scenarios +output_type: multiple_choice +training_split: null 
+validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_nutrition.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_nutrition.yaml new file mode 100644 index 0000000000..24ad69b90d --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_nutrition.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_nutrition +dataset_path: OALL/Arabic_MMLU +dataset_name: nutrition +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_philosophy.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_philosophy.yaml new file mode 100644 index 0000000000..a57dcf7ecd --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_philosophy.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_philosophy +dataset_path: OALL/Arabic_MMLU +dataset_name: philosophy +output_type: multiple_choice 
+training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_prehistory.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_prehistory.yaml new file mode 100644 index 0000000000..45ba2e5de2 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_prehistory.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_prehistory +dataset_path: OALL/Arabic_MMLU +dataset_name: prehistory +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_accounting.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_accounting.yaml new file mode 100644 index 0000000000..d931a00099 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_accounting.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_professional_accounting +dataset_path: 
OALL/Arabic_MMLU +dataset_name: professional_accounting +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_law.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_law.yaml new file mode 100644 index 0000000000..e11d0368f5 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_law.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_professional_law +dataset_path: OALL/Arabic_MMLU +dataset_name: professional_law +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_medicine.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_medicine.yaml new file mode 100644 index 0000000000..7a10d8157f --- /dev/null +++ 
b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_medicine.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_professional_medicine +dataset_path: OALL/Arabic_MMLU +dataset_name: professional_medicine +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_psychology.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_psychology.yaml new file mode 100644 index 0000000000..bb12274adb --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_professional_psychology.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_professional_psychology +dataset_path: OALL/Arabic_MMLU +dataset_name: professional_psychology +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_public_relations.yaml 
b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_public_relations.yaml new file mode 100644 index 0000000000..3361f775b4 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_public_relations.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_public_relations +dataset_path: OALL/Arabic_MMLU +dataset_name: public_relations +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_security_studies.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_security_studies.yaml new file mode 100644 index 0000000000..781a6145f0 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_security_studies.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_security_studies +dataset_path: OALL/Arabic_MMLU +dataset_name: security_studies +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git 
a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_sociology.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_sociology.yaml new file mode 100644 index 0000000000..2c80872c97 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_sociology.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_sociology +dataset_path: OALL/Arabic_MMLU +dataset_name: sociology +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_us_foreign_policy.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_us_foreign_policy.yaml new file mode 100644 index 0000000000..f767e0a78d --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_us_foreign_policy.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_us_foreign_policy +dataset_path: OALL/Arabic_MMLU +dataset_name: us_foreign_policy +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + 
higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_virology.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_virology.yaml new file mode 100644 index 0000000000..8103face6c --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_virology.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_virology +dataset_path: OALL/Arabic_MMLU +dataset_name: virology +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_world_religions.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_world_religions.yaml new file mode 100644 index 0000000000..31c563cc53 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mmlu/arabic_leaderboard_arabic_mmlu_world_religions.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_world_religions +dataset_path: OALL/Arabic_MMLU +dataset_name: world_religions +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: 
# fmt: off
LETTER_INDICES_AR = ["أ", "ب", "ج", "د", "هـ", "و", "ز", "ح", "ط", "ي", "ك", "ل", "م", "ن", "س", "ع", "ف", "ص", "ق", "ر", "ش", "ت", "ث", "خ", "ذ", "ض", "ظ", "غ"]
# fmt: on


# fmt: off
LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
# fmt: on


def process_docs(dataset: datasets.Dataset):
    """Render every row into a prompt, a list of answer labels and a gold index.

    Each row is read through its ``subject``, ``question``, ``A``-``D`` and
    ``answer`` fields. The mapped row gains three entries:

    * ``query``   - the instruction, the question, one "<label>. <option>"
      line per option, and the trailing answer cue;
    * ``choices`` - the first four entries of ``LETTER_INDICES_AR``;
    * ``gold``    - the position of ``answer`` inside ``LETTER_INDICES``.

    Returns whatever ``dataset.map`` returns for the per-row function.
    """

    def _process_doc(doc):
        subject = doc["subject"]
        preamble = f"الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح حول {subject.replace('_', ' ')}. \n\n"
        options = [doc["A"], doc["B"], doc["C"], doc["D"]]
        # The answer key is stored as a Latin letter; its position in
        # LETTER_INDICES is reused to point at the matching Arabic label.
        gold_position = LETTER_INDICES.index(doc["answer"])

        labels = LETTER_INDICES_AR[:4]
        pieces = [f"{preamble}{doc['question']}\n"]
        for label, option in zip(labels, options):
            pieces.append(f"{label}. {option}\n")
        pieces.append("الإجابة:")

        return {"query": "".join(pieces), "choices": labels, "gold": gold_position}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Turn raw multiple-choice rows into prompt / choices / gold fields.

    Each row is read for its question (``query``) and the index of the
    correct option (``label``). Every other column except ``__few_shots``
    is treated as an answer option, in the row's own column order.

    Returns whatever ``dataset.map`` returns for the per-row function, which
    yields ``query`` (the full Arabic prompt), ``choices`` (the option
    texts) and ``gold`` (``label`` converted to ``int``).
    """
    non_option_columns = ("query", "label", "__few_shots")
    # "The following questions are multiple-choice questions with the correct answer"
    preamble = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"

    def _process_doc(doc):
        stem = doc["query"]
        gold = int(doc["label"])
        options = [doc[name] for name in doc.keys() if name not in non_option_columns]

        # Options are numbered from 0 so the numbering lines up with ``gold``.
        numbered = "".join(f"{pos}) {text}\n" for pos, text in enumerate(options))
        prompt = f"{preamble}السؤال: {stem}\n{numbered}الإجابة:"

        return {"query": prompt, "choices": options, "gold": gold}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Turn raw multiple-choice rows into prompt / choices / gold fields.

    Each row is read for its question (``query``) and the index of the
    correct option (``label``). Every other column except ``__few_shots``
    is treated as an answer option, in the row's own column order.

    Returns whatever ``dataset.map`` returns for the per-row function, which
    yields ``query`` (the full Arabic prompt), ``choices`` (the option
    texts) and ``gold`` (``label`` converted to ``int``).
    """
    non_option_columns = ("query", "label", "__few_shots")
    # "The following questions are multiple-choice questions with the correct answer"
    preamble = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"

    def _process_doc(doc):
        stem = doc["query"]
        gold = int(doc["label"])
        options = [doc[name] for name in doc.keys() if name not in non_option_columns]

        # Options are numbered from 0 so the numbering lines up with ``gold``.
        numbered = "".join(f"{pos}) {text}\n" for pos, text in enumerate(options))
        prompt = f"{preamble}السؤال: {stem}\n{numbered}الإجابة:"

        return {"query": prompt, "choices": options, "gold": gold}

    return dataset.map(_process_doc)
a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/arabic_leaderboard_arabic_mt_boolq.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/arabic_leaderboard_arabic_mt_boolq.yaml new file mode 100644 index 0000000000..5072f01dd7 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/arabic_leaderboard_arabic_mt_boolq.yaml @@ -0,0 +1,13 @@ +group: arabic_leaderboard_arabic_mt_boolq +task: + - arabic_mt_boolq + +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/arabic_mt_boolq.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/arabic_mt_boolq.yaml new file mode 100644 index 0000000000..299570af81 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/arabic_mt_boolq.yaml @@ -0,0 +1,23 @@ +task: arabic_mt_boolq +dataset_path: OALL/AlGhafa-Arabic-LLM-Benchmark-Translated +dataset_name: boolq_ar +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/utils.py b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/utils.py new file mode 100644 index 0000000000..dcbc10d92e --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_arabic_mt_boolq/utils.py @@ -0,0 +1,24 
def process_docs(dataset: datasets.Dataset):
    """Build yes/no reading-comprehension prompts from passage/question rows.

    The per-row function reads ``question``, ``passage`` and ``answer`` and
    yields ``query`` (the Arabic prompt), ``choices`` (yes, no) and ``gold``
    (0 when ``answer`` is truthy, otherwise 1). The return value is whatever
    ``dataset.map`` returns for that function.
    """

    def _process_doc(doc):
        asked = doc["question"]
        context = doc["passage"]
        # "Based on the following passage, answer the question with yes or no"
        header = "بناء على المقطع التالي، أجب عن السؤال ب نعم أو لا"
        # The continuation lines' indentation is part of the prompt text.
        prompt = f"""{header}
        المقطع :
        {context}
        السؤال:
        {asked}
        الإجابة:
        """

        # Index 0 is "yes" and index 1 is "no" in the choices list below.
        if doc["answer"]:
            gold = 0
        else:
            gold = 1

        return {"query": prompt, "choices": ["نعم", "لا"], "gold": gold}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Build two-option cause/effect prompts from COPA-style rows.

    The per-row function reads ``premise``, ``choice1``, ``choice2``,
    ``question`` (``"cause"`` or ``"effect"``) and ``label``. It yields
    ``query`` (premise, connective and the two numbered options),
    ``choices`` (the two option texts) and ``gold`` (``label`` unchanged).
    The return value is whatever ``dataset.map`` returns for that function.
    """
    # Arabic connectives: "because" for cause questions, "therefore" for effect.
    connectives = {"cause": "لأن", "effect": "لذلك"}

    def _process_doc(doc):
        premise = doc["premise"]
        first, second = doc["choice1"], doc["choice2"]
        connective = connectives[doc["question"]]
        gold = doc["label"]

        prompt = f"{premise}، {connective} :\n0) {first}\n1) {second}\nالإجابة:"

        return {"query": prompt, "choices": [first, second], "gold": gold}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Build sentence-completion prompts from HellaSwag-style rows.

    The per-row function reads ``ctx``, ``endings`` (a string holding the
    Python-literal representation of a list of endings) and ``label``.
    Bracketed spans are stripped from the context and from every ending.
    It yields ``query`` (the Arabic prompt with numbered endings),
    ``choices`` (the cleaned endings) and ``gold`` (``label`` unchanged).
    The return value is whatever ``dataset.map`` returns for that function.

    Raises:
        ValueError / SyntaxError: if ``endings`` is a string that is not a
            plain Python literal.
    """
    import ast  # function-scope import: only this block needs it

    # Removes bracketed spans (Latin words left inside [...] by translation).
    bracketed = re.compile(r"\[.*?\]")

    def _parse_endings(raw):
        # Accept an already-decoded sequence as-is; only strings need parsing.
        if not isinstance(raw, str):
            return list(raw)
        # SECURITY: this used to be ``eval(raw)``. The string comes from a
        # downloaded dataset, so it must never be executed as code;
        # ``literal_eval`` only accepts plain literals.
        return ast.literal_eval(raw)

    def _process_doc(doc):
        ctx = bracketed.sub("", doc["ctx"])
        endings = [bracketed.sub("", e) for e in _parse_endings(doc["endings"])]
        answer_index = doc["label"]
        # "Based on the following context, choose the correct ending from
        # the following suggestions"
        instruction = (
            "بناء على السياق التالي، اختر النهاية الصحيحة من الاقتراحات التالية"
        )

        # The continuation lines' indentation is part of the prompt text.
        query = f"""{instruction}
        السياق:
        {ctx}
        الاقتراحات:

        """
        query += "".join(f"{i}) {ending}\n" for i, ending in enumerate(endings))
        query += "الإجابة:"

        return {"query": query, "choices": endings, "gold": answer_index}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Turn raw multiple-choice rows into prompt / choices / gold fields.

    Each row is read for its question (``query``) and the index of the
    correct option (``label``). Every other column except ``__few_shots``
    is treated as an answer option, in the row's own column order.

    Returns whatever ``dataset.map`` returns for the per-row function, which
    yields ``query`` (the full Arabic prompt), ``choices`` (the option
    texts) and ``gold`` (``label`` converted to ``int``).
    """
    non_option_columns = ("query", "label", "__few_shots")
    # "The following questions are multiple-choice questions with the correct answer"
    preamble = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"

    def _process_doc(doc):
        stem = doc["query"]
        gold = int(doc["label"])
        options = [doc[name] for name in doc.keys() if name not in non_option_columns]

        # Options are numbered from 0 so the numbering lines up with ``gold``.
        numbered = "".join(f"{pos}) {text}\n" for pos, text in enumerate(options))
        prompt = f"{preamble}السؤال: {stem}\n{numbered}الإجابة:"

        return {"query": prompt, "choices": options, "gold": gold}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Turn raw multiple-choice rows into prompt / choices / gold fields.

    Each row is read for its question (``query``) and the index of the
    correct option (``label``). Every other column except ``__few_shots``
    is treated as an answer option, in the row's own column order.

    Returns whatever ``dataset.map`` returns for the per-row function, which
    yields ``query`` (the full Arabic prompt), ``choices`` (the option
    texts) and ``gold`` (``label`` converted to ``int``).
    """
    non_option_columns = ("query", "label", "__few_shots")
    # "The following questions are multiple-choice questions with the correct answer"
    preamble = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"

    def _process_doc(doc):
        stem = doc["query"]
        gold = int(doc["label"])
        options = [doc[name] for name in doc.keys() if name not in non_option_columns]

        # Options are numbered from 0 so the numbering lines up with ``gold``.
        numbered = "".join(f"{pos}) {text}\n" for pos, text in enumerate(options))
        prompt = f"{preamble}السؤال: {stem}\n{numbered}الإجابة:"

        return {"query": prompt, "choices": options, "gold": gold}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Turn raw multiple-choice rows into prompt / choices / gold fields.

    Each row is read for its question (``query``) and the index of the
    correct option (``label``). Every other column except ``__few_shots``
    is treated as an answer option, in the row's own column order.

    Returns whatever ``dataset.map`` returns for the per-row function, which
    yields ``query`` (the full Arabic prompt), ``choices`` (the option
    texts) and ``gold`` (``label`` converted to ``int``).
    """
    non_option_columns = ("query", "label", "__few_shots")
    # "The following questions are multiple-choice questions with the correct answer"
    preamble = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"

    def _process_doc(doc):
        stem = doc["query"]
        gold = int(doc["label"])
        options = [doc[name] for name in doc.keys() if name not in non_option_columns]

        # Options are numbered from 0 so the numbering lines up with ``gold``.
        numbered = "".join(f"{pos}) {text}\n" for pos, text in enumerate(options))
        prompt = f"{preamble}السؤال: {stem}\n{numbered}الإجابة:"

        return {"query": prompt, "choices": options, "gold": gold}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Turn raw multiple-choice rows into prompt / choices / gold fields.

    Each row is read for its question (``query``) and the index of the
    correct option (``label``). Every other column except ``__few_shots``
    is treated as an answer option, in the row's own column order.

    Returns whatever ``dataset.map`` returns for the per-row function, which
    yields ``query`` (the full Arabic prompt), ``choices`` (the option
    texts) and ``gold`` (``label`` converted to ``int``).
    """
    non_option_columns = ("query", "label", "__few_shots")
    # "The following questions are multiple-choice questions with the correct answer"
    preamble = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"

    def _process_doc(doc):
        stem = doc["query"]
        gold = int(doc["label"])
        options = [doc[name] for name in doc.keys() if name not in non_option_columns]

        # Options are numbered from 0 so the numbering lines up with ``gold``.
        numbered = "".join(f"{pos}) {text}\n" for pos, text in enumerate(options))
        prompt = f"{preamble}السؤال: {stem}\n{numbered}الإجابة:"

        return {"query": prompt, "choices": options, "gold": gold}

    return dataset.map(_process_doc)
def doc_to_text(doc):
    """Build the Arabic SciQ prompt from a row's ``support`` and ``question``.

    Returns the instruction, the supporting context, the question and a
    "possible answers" header as one multi-line string.
    """
    # "Based on the context below, choose the correct answer to the
    # following question from the list of suggestions"
    instruction = (
        "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال التالي من قائمة الاقتراحات"
    )
    support = doc["support"]
    question = doc["question"]
    # NOTE(review): the prompt ends right after the "possible answers"
    # header and does not enumerate the options itself. Left unchanged so
    # existing scores stay comparable; confirm this is intended.
    # The continuation lines' indentation is part of the prompt text.
    query = f"""{instruction}
    السياق:
    {support}
    السؤال:
    {question}
    الإجابات المحتملة:

    """
    return query


def process_docs(dataset: datasets.Dataset):
    """Attach prompt, shuffled choices and gold index to SciQ-style rows.

    The per-row function reads ``correct_answer`` and ``distractor1..3``,
    shuffles the four options and yields ``query`` (see ``doc_to_text``),
    ``choices`` (the shuffled options) and ``gold`` (the position of the
    correct answer after shuffling). The return value is whatever
    ``dataset.map`` returns for that function.

    The shuffle uses a private RNG seeded from the row's question text, so
    a given row always gets the same option order. It no longer depends on
    the global ``random`` state or on the order rows are processed in.
    """

    def _process_doc(doc):
        correct_answer = doc["correct_answer"]
        choices = [
            doc["distractor1"],
            doc["distractor2"],
            doc["distractor3"],
            correct_answer,
        ]

        # Per-row deterministic shuffle: string seeds are hashed by the
        # random module itself, so the order is stable across processes.
        rng = random.Random(str(doc["question"]))
        rng.shuffle(choices)

        answer_index = choices.index(correct_answer)

        return {"query": doc_to_text(doc), "choices": choices, "gold": answer_index}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Build yes/no hate-speech prompts from ToxiGen-style rows.

    The per-row function reads ``text``, ``toxicity_ai`` and
    ``toxicity_human``. It yields ``query`` (the Arabic prompt quoting the
    text), ``choices`` (no, yes) and ``gold`` (1 when the two toxicity
    scores sum to more than 5.5, otherwise 0). The return value is whatever
    ``dataset.map`` returns for that function.
    """

    def _process_doc(doc):
        statement = doc["text"]
        combined_score = doc["toxicity_ai"] + doc["toxicity_human"]
        # Index 1 is "yes" and index 0 is "no" in the choices list below.
        gold = 1 if combined_score > 5.5 else 0
        # 'Does the following statement incite hatred? Respond with "yes" or "no".'
        header = 'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".'

        # The continuation lines' indentation is part of the prompt text.
        prompt = f"""{header}
        العبارة:
        '{statement}'
        الإجابة:
        """
        return {"query": prompt, "choices": ["لا", "نعم"], "gold": gold}

    return dataset.map(_process_doc)
arabic_leaderboard_acva_InfluenceFromAncientEgypt + - arabic_leaderboard_acva_InfluenceFromByzantium + - arabic_leaderboard_acva_InfluenceFromChina + - arabic_leaderboard_acva_InfluenceFromGreece + - arabic_leaderboard_acva_InfluenceFromIslam + - arabic_leaderboard_acva_InfluenceFromPersia + - arabic_leaderboard_acva_InfluenceFromRome + - arabic_leaderboard_acva_Iraq + - arabic_leaderboard_acva_Islam_Education + - arabic_leaderboard_acva_Islam_branches_and_schools + - arabic_leaderboard_acva_Islamic_law_system + - arabic_leaderboard_acva_Jordan + - arabic_leaderboard_acva_Kuwait + - arabic_leaderboard_acva_Lebanon + - arabic_leaderboard_acva_Libya + - arabic_leaderboard_acva_Mauritania + - arabic_leaderboard_acva_Mesopotamia_civilization + - arabic_leaderboard_acva_Morocco + - arabic_leaderboard_acva_Oman + - arabic_leaderboard_acva_Palestine + - arabic_leaderboard_acva_Qatar + - arabic_leaderboard_acva_Saudi_Arabia + - arabic_leaderboard_acva_Somalia + - arabic_leaderboard_acva_Sudan + - arabic_leaderboard_acva_Syria + - arabic_leaderboard_acva_Tunisia + - arabic_leaderboard_acva_United_Arab_Emirates + - arabic_leaderboard_acva_Yemen + - arabic_leaderboard_acva_communication + - arabic_leaderboard_acva_computer_and_phone + - arabic_leaderboard_acva_daily_life + - arabic_leaderboard_acva_entertainment + +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Algeria.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Algeria.yaml new file mode 100644 index 0000000000..177161edaa --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Algeria.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Algeria +dataset_path: OALL/ACVA +dataset_name: Algeria 
+output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Ancient_Egypt.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Ancient_Egypt.yaml new file mode 100644 index 0000000000..ddb5c35555 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Ancient_Egypt.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Ancient_Egypt +dataset_path: OALL/ACVA +dataset_name: Ancient_Egypt +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arab_Empire.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arab_Empire.yaml new file mode 100644 index 0000000000..b510de5ab9 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arab_Empire.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arab_Empire +dataset_path: OALL/ACVA +dataset_name: Arab_Empire +output_type: multiple_choice +training_split: 
null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Architecture.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Architecture.yaml new file mode 100644 index 0000000000..5dc2c07dee --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Architecture.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Architecture +dataset_path: OALL/ACVA +dataset_name: Arabic_Architecture +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Art.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Art.yaml new file mode 100644 index 0000000000..36f364bc50 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Art.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Art +dataset_path: OALL/ACVA +dataset_name: Arabic_Art +output_type: multiple_choice +training_split: null 
+validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Astronomy.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Astronomy.yaml new file mode 100644 index 0000000000..f90b1c9140 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Astronomy.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Astronomy +dataset_path: OALL/ACVA +dataset_name: Arabic_Astronomy +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Calligraphy.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Calligraphy.yaml new file mode 100644 index 0000000000..dfdf51878b --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Calligraphy.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Calligraphy +dataset_path: OALL/ACVA +dataset_name: Arabic_Calligraphy +output_type: multiple_choice +training_split: 
null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Ceremony.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Ceremony.yaml new file mode 100644 index 0000000000..c20b4439e2 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Ceremony.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Ceremony +dataset_path: OALL/ACVA +dataset_name: Arabic_Ceremony +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Clothing.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Clothing.yaml new file mode 100644 index 0000000000..06118034dc --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Clothing.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Clothing +dataset_path: OALL/ACVA +dataset_name: Arabic_Clothing +output_type: multiple_choice +training_split: null 
+validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Culture.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Culture.yaml new file mode 100644 index 0000000000..cea33022b4 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Culture.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Culture +dataset_path: OALL/ACVA +dataset_name: Arabic_Culture +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Food.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Food.yaml new file mode 100644 index 0000000000..cca516c972 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Food.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Food +dataset_path: OALL/ACVA +dataset_name: Arabic_Food +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: 
test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Funeral.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Funeral.yaml new file mode 100644 index 0000000000..3dd8fbedd9 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Funeral.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Funeral +dataset_path: OALL/ACVA +dataset_name: Arabic_Funeral +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Geography.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Geography.yaml new file mode 100644 index 0000000000..89aa7361b3 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Geography.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Geography +dataset_path: OALL/ACVA +dataset_name: Arabic_Geography +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test 
+process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_History.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_History.yaml new file mode 100644 index 0000000000..776589c07b --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_History.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_History +dataset_path: OALL/ACVA +dataset_name: Arabic_History +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Language_Origin.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Language_Origin.yaml new file mode 100644 index 0000000000..4f0612acaf --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Language_Origin.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Language_Origin +dataset_path: OALL/ACVA +dataset_name: Arabic_Language_Origin +output_type: multiple_choice +training_split: null +validation_split: validation 
+test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Literature.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Literature.yaml new file mode 100644 index 0000000000..0c9198f446 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Literature.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Literature +dataset_path: OALL/ACVA +dataset_name: Arabic_Literature +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Math.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Math.yaml new file mode 100644 index 0000000000..02a3643024 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Math.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Math +dataset_path: OALL/ACVA +dataset_name: Arabic_Math +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test 
+process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Medicine.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Medicine.yaml new file mode 100644 index 0000000000..109aae994a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Medicine.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Medicine +dataset_path: OALL/ACVA +dataset_name: Arabic_Medicine +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Music.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Music.yaml new file mode 100644 index 0000000000..2559625784 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Music.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Music +dataset_path: OALL/ACVA +dataset_name: Arabic_Music +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function 
utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Ornament.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Ornament.yaml new file mode 100644 index 0000000000..00311e107e --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Ornament.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Ornament +dataset_path: OALL/ACVA +dataset_name: Arabic_Ornament +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Philosophy.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Philosophy.yaml new file mode 100644 index 0000000000..62a570f00c --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Philosophy.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Philosophy +dataset_path: OALL/ACVA +dataset_name: Arabic_Philosophy +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function 
utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Physics_and_Chemistry.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Physics_and_Chemistry.yaml new file mode 100644 index 0000000000..b1b52096e4 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Physics_and_Chemistry.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Physics_and_Chemistry +dataset_path: OALL/ACVA +dataset_name: Arabic_Physics_and_Chemistry +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Wedding.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Wedding.yaml new file mode 100644 index 0000000000..21205cfff8 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Arabic_Wedding.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Wedding +dataset_path: OALL/ACVA +dataset_name: Arabic_Wedding +output_type: multiple_choice +training_split: null +validation_split: validation 
+test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Bahrain.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Bahrain.yaml new file mode 100644 index 0000000000..3b2481bc87 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Bahrain.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Bahrain +dataset_path: OALL/ACVA +dataset_name: Bahrain +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Comoros.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Comoros.yaml new file mode 100644 index 0000000000..be4df372c7 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Comoros.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Comoros +dataset_path: OALL/ACVA +dataset_name: Comoros +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" 
+doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Egypt_modern.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Egypt_modern.yaml new file mode 100644 index 0000000000..26ca2f6e08 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Egypt_modern.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Egypt_modern +dataset_path: OALL/ACVA +dataset_name: Egypt_modern +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromAncientEgypt.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromAncientEgypt.yaml new file mode 100644 index 0000000000..be300fc869 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromAncientEgypt.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_InfluenceFromAncientEgypt +dataset_path: OALL/ACVA +dataset_name: InfluenceFromAncientEgypt +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs 
+doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromByzantium.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromByzantium.yaml new file mode 100644 index 0000000000..72c86a6247 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromByzantium.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_InfluenceFromByzantium +dataset_path: OALL/ACVA +dataset_name: InfluenceFromByzantium +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromChina.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromChina.yaml new file mode 100644 index 0000000000..b297642cbe --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromChina.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_InfluenceFromChina +dataset_path: OALL/ACVA +dataset_name: InfluenceFromChina +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test 
+process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromGreece.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromGreece.yaml new file mode 100644 index 0000000000..70458ea2d3 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromGreece.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_InfluenceFromGreece +dataset_path: OALL/ACVA +dataset_name: InfluenceFromGreece +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromIslam.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromIslam.yaml new file mode 100644 index 0000000000..803f33345d --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromIslam.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_InfluenceFromIslam +dataset_path: OALL/ACVA +dataset_name: InfluenceFromIslam +output_type: multiple_choice +training_split: null +validation_split: validation 
+test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromPersia.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromPersia.yaml new file mode 100644 index 0000000000..117ca89079 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromPersia.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_InfluenceFromPersia +dataset_path: OALL/ACVA +dataset_name: InfluenceFromPersia +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromRome.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromRome.yaml new file mode 100644 index 0000000000..1655522e5a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_InfluenceFromRome.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_InfluenceFromRome +dataset_path: OALL/ACVA +dataset_name: InfluenceFromRome +output_type: multiple_choice +training_split: null 
+validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Iraq.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Iraq.yaml new file mode 100644 index 0000000000..909c6678c7 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Iraq.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Iraq +dataset_path: OALL/ACVA +dataset_name: Iraq +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Islam_Education.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Islam_Education.yaml new file mode 100644 index 0000000000..13c1fab2a0 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Islam_Education.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Islam_Education +dataset_path: OALL/ACVA +dataset_name: Islam_Education +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function 
utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Islam_branches_and_schools.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Islam_branches_and_schools.yaml new file mode 100644 index 0000000000..6985b24a74 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Islam_branches_and_schools.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Islam_branches_and_schools +dataset_path: OALL/ACVA +dataset_name: Islam_branches_and_schools +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Islamic_law_system.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Islamic_law_system.yaml new file mode 100644 index 0000000000..d19a52ba03 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Islamic_law_system.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Islamic_law_system +dataset_path: OALL/ACVA +dataset_name: Islamic_law_system +output_type: multiple_choice +training_split: null +validation_split: 
validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Jordan.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Jordan.yaml new file mode 100644 index 0000000000..7bff93a94c --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Jordan.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Jordan +dataset_path: OALL/ACVA +dataset_name: Jordan +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Kuwait.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Kuwait.yaml new file mode 100644 index 0000000000..b1ae77aaa5 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Kuwait.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Kuwait +dataset_path: OALL/ACVA +dataset_name: Kuwait +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" 
+doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Lebanon.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Lebanon.yaml new file mode 100644 index 0000000000..65974b74dc --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Lebanon.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Lebanon +dataset_path: OALL/ACVA +dataset_name: Lebanon +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Libya.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Libya.yaml new file mode 100644 index 0000000000..c8b339650c --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Libya.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Libya +dataset_path: OALL/ACVA +dataset_name: Libya +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: 
first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Mauritania.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Mauritania.yaml new file mode 100644 index 0000000000..1b84074abc --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Mauritania.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Mauritania +dataset_path: OALL/ACVA +dataset_name: Mauritania +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Mesopotamia_civilization.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Mesopotamia_civilization.yaml new file mode 100644 index 0000000000..4218947702 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Mesopotamia_civilization.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Mesopotamia_civilization +dataset_path: OALL/ACVA +dataset_name: Mesopotamia_civilization +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + 
sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Morocco.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Morocco.yaml new file mode 100644 index 0000000000..4ed1510bb5 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Morocco.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Morocco +dataset_path: OALL/ACVA +dataset_name: Morocco +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Oman.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Oman.yaml new file mode 100644 index 0000000000..b534cfb19f --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Oman.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Oman +dataset_path: OALL/ACVA +dataset_name: Oman +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm 
+ aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Palestine.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Palestine.yaml new file mode 100644 index 0000000000..1cb9b56a85 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Palestine.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Palestine +dataset_path: OALL/ACVA +dataset_name: Palestine +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Qatar.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Qatar.yaml new file mode 100644 index 0000000000..5d5775ccd9 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Qatar.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Qatar +dataset_path: OALL/ACVA +dataset_name: Qatar +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git 
a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Saudi_Arabia.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Saudi_Arabia.yaml new file mode 100644 index 0000000000..5010723661 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Saudi_Arabia.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Saudi_Arabia +dataset_path: OALL/ACVA +dataset_name: Saudi_Arabia +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Somalia.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Somalia.yaml new file mode 100644 index 0000000000..d40b578221 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Somalia.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Somalia +dataset_path: OALL/ACVA +dataset_name: Somalia +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git 
a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Sudan.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Sudan.yaml new file mode 100644 index 0000000000..e7c2f41a3b --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Sudan.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Sudan +dataset_path: OALL/ACVA +dataset_name: Sudan +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Syria.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Syria.yaml new file mode 100644 index 0000000000..98ebff9fca --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Syria.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Syria +dataset_path: OALL/ACVA +dataset_name: Syria +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Tunisia.yaml 
b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Tunisia.yaml new file mode 100644 index 0000000000..d86e428cc3 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Tunisia.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Tunisia +dataset_path: OALL/ACVA +dataset_name: Tunisia +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_United_Arab_Emirates.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_United_Arab_Emirates.yaml new file mode 100644 index 0000000000..f41b625508 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_United_Arab_Emirates.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_United_Arab_Emirates +dataset_path: OALL/ACVA +dataset_name: United_Arab_Emirates +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Yemen.yaml 
b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Yemen.yaml new file mode 100644 index 0000000000..b239dd514a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_Yemen.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Yemen +dataset_path: OALL/ACVA +dataset_name: Yemen +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_communication.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_communication.yaml new file mode 100644 index 0000000000..beb954efce --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_communication.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_communication +dataset_path: OALL/ACVA +dataset_name: communication +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_computer_and_phone.yaml 
b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_computer_and_phone.yaml new file mode 100644 index 0000000000..888f82af92 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_computer_and_phone.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_computer_and_phone +dataset_path: OALL/ACVA +dataset_name: computer_and_phone +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_daily_life.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_daily_life.yaml new file mode 100644 index 0000000000..0b4748a297 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_daily_life.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_daily_life +dataset_path: OALL/ACVA +dataset_name: daily_life +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_entertainment.yaml 
b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_entertainment.yaml new file mode 100644 index 0000000000..b2adcfb954 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/arabic_leaderboard_acva_entertainment.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_entertainment +dataset_path: OALL/ACVA +dataset_name: entertainment +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/utils.py b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/utils.py new file mode 100644 index 0000000000..7e91496f59 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_avca/utils.py @@ -0,0 +1,16 @@
+import datasets
+import numpy as np
+
+
+def process_docs(dataset: datasets.Dataset):
+    """Add `query`, `choices` and `gold` fields to each document via `dataset.map`."""
+    def _process_doc(doc):
+        # `list.index` raises ValueError when the answer is not one of the two labels.
+        labels = ["صح", "خطأ"]
+        return {
+            "query": f"السؤال: {doc['question']}\nالإجابة:",
+            "choices": labels,
+            "gold": labels.index(doc["answer"]),
+        }
+
+    return dataset.map(_process_doc)
diff --git a/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_complete.yaml b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_complete.yaml new file mode 100644 index 0000000000..c26370157d --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_complete/arabic_leaderboard_complete.yaml @@ -0,0 +1,25 @@ +group: arabic_leaderboard_complete +task: + - arabic_leaderboard_acva + - arabic_leaderboard_alghafa + - arabic_leaderboard_arabic_exams + -
arabic_leaderboard_arabic_mt_arc_challenge + - arabic_leaderboard_arabic_mt_arc_easy + - arabic_leaderboard_arabic_mt_boolq + - arabic_leaderboard_arabic_mt_hellaswag + - arabic_leaderboard_arabic_mt_mmlu + - arabic_leaderboard_arabic_mt_copa + - arabic_leaderboard_arabic_mt_openbook_qa + - arabic_leaderboard_arabic_mt_piqa + - arabic_leaderboard_arabic_mt_race + - arabic_leaderboard_arabic_mt_sciq + - arabic_leaderboard_arabic_mt_toxigen +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/README.md b/lm_eval/tasks/arabic_leaderboard_light/README.md new file mode 100644 index 0000000000..199aa2c8da --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/README.md @@ -0,0 +1,20 @@ +# Arabic Leaderboard Light + +Title: Open Arabic LLM Leaderboard Light + +This leaderboard follows the same setup as [`arabic_leaderboard_complete`](../arabic_leaderboard_complete), except that a light version of each benchmark - a 10% random sample of its test set - is used to evaluate the language models. + +NOTE: The Yemen subset of the ACVA benchmark is small - it has only 10 samples in the test split. So, to get more reliable results for this subset, we use the original dataset instead of 10% of its test samples. + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_light.yaml new file mode 100644 index 0000000000..0ee6a568d9 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_light.yaml @@ -0,0 +1,23 @@ +group: arabic_leaderboard_alghafa_light +task: + - arabic_leaderboard_alghafa_mcq_exams_test_ar_light + - arabic_leaderboard_alghafa_meta_ar_dialects_light + - arabic_leaderboard_alghafa_meta_ar_msa_light + - arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task_light + - arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task_light + - arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task_light + - arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task_light + - arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task_light + - arabic_leaderboard_alghafa_multiple_choice_sentiment_task_light + + + +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_mcq_exams_test_ar_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_mcq_exams_test_ar_light.yaml new file mode 100644 index 0000000000..1fdda36405 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_mcq_exams_test_ar_light.yaml @@ -0,0 +1,23 @@ +task: 
arabic_leaderboard_alghafa_mcq_exams_test_ar_light +dataset_path: arcee-globe/AlGhafa-Arabic-LLM-Benchmark-Native-10percent +dataset_name: mcq_exams_test_ar +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_meta_ar_dialects_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_meta_ar_dialects_light.yaml new file mode 100644 index 0000000000..47af55b86a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_meta_ar_dialects_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_meta_ar_dialects_light +dataset_path: arcee-globe/AlGhafa-Arabic-LLM-Benchmark-Native-10percent +dataset_name: meta_ar_dialects +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_meta_ar_msa_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_meta_ar_msa_light.yaml new file mode 100644 index 
0000000000..9a26a2653f --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_meta_ar_msa_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_meta_ar_msa_light +dataset_path: arcee-globe/AlGhafa-Arabic-LLM-Benchmark-Native-10percent +dataset_name: meta_ar_msa +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task_light.yaml new file mode 100644 index 0000000000..b56ddfee19 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_multiple_choice_facts_truefalse_balanced_task_light +dataset_path: arcee-globe/AlGhafa-Arabic-LLM-Benchmark-Native-10percent +dataset_name: multiple_choice_facts_truefalse_balanced_task +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: 
true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task_light.yaml new file mode 100644 index 0000000000..4d85c68491 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_multiple_choice_grounded_statement_soqal_task_light +dataset_path: arcee-globe/AlGhafa-Arabic-LLM-Benchmark-Native-10percent +dataset_name: multiple_choice_grounded_statement_soqal_task +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task_light.yaml new file mode 100644 index 0000000000..e5d8afefea --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_multiple_choice_grounded_statement_xglue_mlqa_task_light +dataset_path: 
arcee-globe/AlGhafa-Arabic-LLM-Benchmark-Native-10percent +dataset_name: multiple_choice_grounded_statement_xglue_mlqa_task +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task_light.yaml new file mode 100644 index 0000000000..21721d2a2d --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_no_neutral_task_light +dataset_path: arcee-globe/AlGhafa-Arabic-LLM-Benchmark-Native-10percent +dataset_name: multiple_choice_rating_sentiment_no_neutral_task +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task_light.yaml 
b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task_light.yaml new file mode 100644 index 0000000000..39f72e4d2a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_multiple_choice_rating_sentiment_task_light +dataset_path: arcee-globe/AlGhafa-Arabic-LLM-Benchmark-Native-10percent +dataset_name: multiple_choice_rating_sentiment_task +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_sentiment_task_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_sentiment_task_light.yaml new file mode 100644 index 0000000000..28b0701561 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_alghafa_light/arabic_leaderboard_alghafa_multiple_choice_sentiment_task_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_alghafa_multiple_choice_sentiment_task_light +dataset_path: arcee-globe/AlGhafa-Arabic-LLM-Benchmark-Native-10percent +dataset_name: multiple_choice_sentiment_task +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: 
def process_docs(dataset: datasets.Dataset):
    """Attach the multiple-choice prompt fields to every AlGhafa document.

    Each document must provide a ``query`` entry (the question text) and a
    ``label`` entry (converted with ``int`` and returned as ``gold``). Every
    other entry, except the ``__few_shots`` bookkeeping entry, is treated as
    one answer option, in the order in which the document yields its keys.

    Args:
        dataset: Object exposing ``map``; the per-document transform is
            handed to it unchanged.

    Returns:
        Whatever ``dataset.map`` returns for a transform that produces:
        ``query`` (the full Arabic prompt), ``choices`` (the option texts)
        and ``gold`` (the integer taken from ``label``).
    """
    # Entries that are never answer options.
    non_option_keys = ("query", "label", "__few_shots")

    def _process_doc(doc):
        question_text = doc["query"]
        gold_index = int(doc["label"])

        # The options are discovered dynamically: anything that is not a
        # known non-option entry counts as an answer option.
        options = [doc[key] for key in doc if key not in non_option_keys]

        # Header reads: "The following are multiple-choice questions with
        # the correct answer".
        header = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"
        # One "<index>) <option>" line per option, numbered from zero.
        numbered_options = "".join(
            f"{position}) {option}\n" for position, option in enumerate(options)
        )
        prompt = f"{header}السؤال: {question_text}\n{numbered_options}الإجابة:"

        return {"query": prompt, "choices": options, "gold": gold_index}

    return dataset.map(_process_doc)
# Arabic letters used to label the answer options shown in the prompt.
# fmt: off
LETTER_INDICES_AR = ["أ", "ب", "ج", "د", "هـ", "و", "ز", "ح", "ط", "ي", "ك", "ل", "م", "ن", "س", "ع", "ف", "ص", "ق", "ر", "ش", "ت", "ث", "خ", "ذ", "ض", "ظ", "غ"]
# fmt: on


# Latin letters; the position of a document's ``answer`` in this list is
# used as the gold index.
# fmt: off
LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
# fmt: on


def process_docs(dataset: datasets.Dataset):
    """Attach the multiple-choice prompt fields to every Arabic EXAMS document.

    Each document must provide ``subject``, ``question``, the four option
    texts ``A``/``B``/``C``/``D`` and ``answer`` (a letter found in
    ``LETTER_INDICES``).

    Args:
        dataset: Object exposing ``map``; the per-document transform is
            handed to it unchanged.

    Returns:
        Whatever ``dataset.map`` returns for a transform that produces:
        ``query`` (the full Arabic prompt), ``choices`` (the first four
        Arabic option letters) and ``gold`` (the index of ``answer`` in
        ``LETTER_INDICES``).

    Raises:
        ValueError: From ``list.index`` when ``answer`` is not one of the
            letters in ``LETTER_INDICES``.
    """

    def _process_doc(doc):
        subject = doc["subject"]
        question_text = doc["question"]
        options = [doc[letter] for letter in ("A", "B", "C", "D")]

        # Each option line already ends with a newline and the lines are
        # joined with another one below, so options end up separated by a
        # blank line. This layout is kept on purpose.
        option_lines = [
            f" {arabic_letter}) {option}\n"
            for arabic_letter, option in zip(LETTER_INDICES_AR, options)
        ]

        gold_index = LETTER_INDICES.index(doc["answer"])

        # Subjects are stored with underscores; show them as plain words.
        readable_subject = subject.replace("_", " ")
        prompt = "".join(
            (
                # "The following are multiple-choice questions with the
                # correct answer about <subject>."
                f"الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح حول {readable_subject}. \n\n",
                f"السؤال: {question_text}\n",
                "\n".join(option_lines),
                "\nالإجابة:",
            )
        )

        return {"query": prompt, "choices": LETTER_INDICES_AR[:4], "gold": gold_index}

    return dataset.map(_process_doc)
b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_anatomy_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_anatomy_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: anatomy +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_astronomy_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_astronomy_light.yaml new file mode 100644 index 0000000000..db4a9b4360 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_astronomy_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_astronomy_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: astronomy +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_business_ethics_light.yaml 
b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_business_ethics_light.yaml new file mode 100644 index 0000000000..a747dbafaf --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_business_ethics_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_business_ethics_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: business_ethics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_clinical_knowledge_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_clinical_knowledge_light.yaml new file mode 100644 index 0000000000..1296b90cbc --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_clinical_knowledge_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_clinical_knowledge_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: clinical_knowledge +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + 
higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_biology_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_biology_light.yaml new file mode 100644 index 0000000000..cbfc804974 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_biology_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_college_biology_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: college_biology +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_chemistry_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_chemistry_light.yaml new file mode 100644 index 0000000000..ac0970355b --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_chemistry_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_college_chemistry_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: college_chemistry +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: 
"choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_computer_science_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_computer_science_light.yaml new file mode 100644 index 0000000000..361274d64a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_computer_science_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_college_computer_science_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: college_computer_science +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_mathematics_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_mathematics_light.yaml new file mode 100644 index 0000000000..20e4d6e627 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_mathematics_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_college_mathematics_light +dataset_path: arcee-globe/Arabic_MMLU-10percent 
+dataset_name: college_mathematics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_medicine_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_medicine_light.yaml new file mode 100644 index 0000000000..d854004973 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_medicine_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_college_medicine_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: college_medicine +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_physics_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_physics_light.yaml new file mode 100644 index 0000000000..57e4b55033 --- /dev/null +++ 
b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_college_physics_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_college_physics_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: college_physics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_computer_security_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_computer_security_light.yaml new file mode 100644 index 0000000000..dd8c01dc6c --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_computer_security_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_computer_security_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: computer_security +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_conceptual_physics_light.yaml 
b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_conceptual_physics_light.yaml new file mode 100644 index 0000000000..cffd7ee42d --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_conceptual_physics_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_conceptual_physics_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: conceptual_physics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_econometrics_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_econometrics_light.yaml new file mode 100644 index 0000000000..30413feff0 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_econometrics_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_econometrics_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: econometrics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true 
+metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_electrical_engineering_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_electrical_engineering_light.yaml new file mode 100644 index 0000000000..e60787d675 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_electrical_engineering_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_electrical_engineering_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: electrical_engineering +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_elementary_mathematics_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_elementary_mathematics_light.yaml new file mode 100644 index 0000000000..571476620a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_elementary_mathematics_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_elementary_mathematics_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: elementary_mathematics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" 
+doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_formal_logic_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_formal_logic_light.yaml new file mode 100644 index 0000000000..9b2bebf1e5 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_formal_logic_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_formal_logic_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: formal_logic +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_global_facts_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_global_facts_light.yaml new file mode 100644 index 0000000000..15c3b34aac --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_global_facts_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_global_facts_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: global_facts +output_type: 
multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_biology_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_biology_light.yaml new file mode 100644 index 0000000000..906c33284d --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_biology_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_biology_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: high_school_biology +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_chemistry_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_chemistry_light.yaml new file mode 100644 index 0000000000..199f16b093 --- /dev/null +++ 
b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_chemistry_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_chemistry_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: high_school_chemistry +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_computer_science_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_computer_science_light.yaml new file mode 100644 index 0000000000..cb23af53bb --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_computer_science_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_computer_science_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: high_school_computer_science +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git 
a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_european_history_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_european_history_light.yaml new file mode 100644 index 0000000000..25a9b46695 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_european_history_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_european_history_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: high_school_european_history +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_geography_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_geography_light.yaml new file mode 100644 index 0000000000..f7f39cd2f2 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_geography_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_geography_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: high_school_geography +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: 
"{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_government_and_politics_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_government_and_politics_light.yaml new file mode 100644 index 0000000000..dff09d6717 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_government_and_politics_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_government_and_politics_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: high_school_government_and_politics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_macroeconomics_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_macroeconomics_light.yaml new file mode 100644 index 0000000000..ae42622353 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_macroeconomics_light.yaml @@ -0,0 +1,23 @@ +task: 
arabic_leaderboard_arabic_mmlu_high_school_macroeconomics_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: high_school_macroeconomics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_mathematics_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_mathematics_light.yaml new file mode 100644 index 0000000000..8adc3d7e93 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_mathematics_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_mathematics_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: high_school_mathematics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_microeconomics_light.yaml 
b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_microeconomics_light.yaml new file mode 100644 index 0000000000..6eec39237b --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_microeconomics_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_microeconomics_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: high_school_microeconomics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_physics_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_physics_light.yaml new file mode 100644 index 0000000000..973bd1ffc5 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_physics_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_physics_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: high_school_physics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - 
metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_psychology_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_psychology_light.yaml new file mode 100644 index 0000000000..614dd7e89d --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_psychology_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_psychology_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: high_school_psychology +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_statistics_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_statistics_light.yaml new file mode 100644 index 0000000000..2db9f196a3 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_statistics_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_statistics_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: high_school_statistics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: 
!function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_us_history_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_us_history_light.yaml new file mode 100644 index 0000000000..5411e8c479 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_us_history_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_high_school_us_history_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: high_school_us_history +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_world_history_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_world_history_light.yaml new file mode 100644 index 0000000000..319c49b22b --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_high_school_world_history_light.yaml @@ -0,0 +1,23 @@ +task: 
arabic_leaderboard_arabic_mmlu_high_school_world_history_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: high_school_world_history +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_human_aging_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_human_aging_light.yaml new file mode 100644 index 0000000000..afd2eefa29 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_human_aging_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_human_aging_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: human_aging +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_human_sexuality_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_human_sexuality_light.yaml new file mode 100644 index 0000000000..9e245f2687 --- 
/dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_human_sexuality_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_human_sexuality_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: human_sexuality +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_international_law_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_international_law_light.yaml new file mode 100644 index 0000000000..6e476bb879 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_international_law_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_international_law_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: international_law +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_jurisprudence_light.yaml 
b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_jurisprudence_light.yaml new file mode 100644 index 0000000000..1d848cd173 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_jurisprudence_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_jurisprudence_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: jurisprudence +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_light.yaml new file mode 100644 index 0000000000..130713702c --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_light.yaml @@ -0,0 +1,68 @@ +group: arabic_leaderboard_arabic_mmlu_light +task: + - arabic_leaderboard_arabic_mmlu_abstract_algebra_light + - arabic_leaderboard_arabic_mmlu_anatomy_light + - arabic_leaderboard_arabic_mmlu_astronomy_light + - arabic_leaderboard_arabic_mmlu_business_ethics_light + - arabic_leaderboard_arabic_mmlu_clinical_knowledge_light + - arabic_leaderboard_arabic_mmlu_college_biology_light + - arabic_leaderboard_arabic_mmlu_college_chemistry_light + - arabic_leaderboard_arabic_mmlu_college_computer_science_light + - arabic_leaderboard_arabic_mmlu_college_mathematics_light + - 
arabic_leaderboard_arabic_mmlu_college_medicine_light + - arabic_leaderboard_arabic_mmlu_college_physics_light + - arabic_leaderboard_arabic_mmlu_computer_security_light + - arabic_leaderboard_arabic_mmlu_conceptual_physics_light + - arabic_leaderboard_arabic_mmlu_econometrics_light + - arabic_leaderboard_arabic_mmlu_electrical_engineering_light + - arabic_leaderboard_arabic_mmlu_elementary_mathematics_light + - arabic_leaderboard_arabic_mmlu_formal_logic_light + - arabic_leaderboard_arabic_mmlu_global_facts_light + - arabic_leaderboard_arabic_mmlu_high_school_biology_light + - arabic_leaderboard_arabic_mmlu_high_school_chemistry_light + - arabic_leaderboard_arabic_mmlu_high_school_computer_science_light + - arabic_leaderboard_arabic_mmlu_high_school_european_history_light + - arabic_leaderboard_arabic_mmlu_high_school_geography_light + - arabic_leaderboard_arabic_mmlu_high_school_government_and_politics_light + - arabic_leaderboard_arabic_mmlu_high_school_macroeconomics_light + - arabic_leaderboard_arabic_mmlu_high_school_mathematics_light + - arabic_leaderboard_arabic_mmlu_high_school_microeconomics_light + - arabic_leaderboard_arabic_mmlu_high_school_physics_light + - arabic_leaderboard_arabic_mmlu_high_school_psychology_light + - arabic_leaderboard_arabic_mmlu_high_school_statistics_light + - arabic_leaderboard_arabic_mmlu_high_school_us_history_light + - arabic_leaderboard_arabic_mmlu_high_school_world_history_light + - arabic_leaderboard_arabic_mmlu_human_aging_light + - arabic_leaderboard_arabic_mmlu_human_sexuality_light + - arabic_leaderboard_arabic_mmlu_international_law_light + - arabic_leaderboard_arabic_mmlu_jurisprudence_light + - arabic_leaderboard_arabic_mmlu_logical_fallacies_light + - arabic_leaderboard_arabic_mmlu_machine_learning_light + - arabic_leaderboard_arabic_mmlu_management_light + - arabic_leaderboard_arabic_mmlu_marketing_light + - arabic_leaderboard_arabic_mmlu_medical_genetics_light + - 
arabic_leaderboard_arabic_mmlu_miscellaneous_light + - arabic_leaderboard_arabic_mmlu_moral_disputes_light + - arabic_leaderboard_arabic_mmlu_moral_scenarios_light + - arabic_leaderboard_arabic_mmlu_nutrition_light + - arabic_leaderboard_arabic_mmlu_philosophy_light + - arabic_leaderboard_arabic_mmlu_prehistory_light + - arabic_leaderboard_arabic_mmlu_professional_accounting_light + - arabic_leaderboard_arabic_mmlu_professional_law_light + - arabic_leaderboard_arabic_mmlu_professional_medicine_light + - arabic_leaderboard_arabic_mmlu_professional_psychology_light + - arabic_leaderboard_arabic_mmlu_public_relations_light + - arabic_leaderboard_arabic_mmlu_security_studies_light + - arabic_leaderboard_arabic_mmlu_sociology_light + - arabic_leaderboard_arabic_mmlu_us_foreign_policy_light + - arabic_leaderboard_arabic_mmlu_virology_light + - arabic_leaderboard_arabic_mmlu_world_religions_light +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_logical_fallacies_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_logical_fallacies_light.yaml new file mode 100644 index 0000000000..866420ba28 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_logical_fallacies_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_logical_fallacies_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: logical_fallacies +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: 
first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_machine_learning_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_machine_learning_light.yaml new file mode 100644 index 0000000000..01ed181e01 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_machine_learning_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_machine_learning_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: machine_learning +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_management_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_management_light.yaml new file mode 100644 index 0000000000..62d7e32ab0 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_management_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_management_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: management +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function 
utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_marketing_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_marketing_light.yaml new file mode 100644 index 0000000000..c42f7a177b --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_marketing_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_marketing_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: marketing +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_medical_genetics_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_medical_genetics_light.yaml new file mode 100644 index 0000000000..40d0d88326 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_medical_genetics_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_medical_genetics_light +dataset_path: arcee-globe/Arabic_MMLU-10percent 
+dataset_name: medical_genetics +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_miscellaneous_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_miscellaneous_light.yaml new file mode 100644 index 0000000000..06bc6a4715 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_miscellaneous_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_miscellaneous_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: miscellaneous +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_moral_disputes_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_moral_disputes_light.yaml new file mode 100644 index 0000000000..be0c60e631 --- /dev/null +++ 
b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_moral_disputes_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_moral_disputes_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: moral_disputes +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_moral_scenarios_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_moral_scenarios_light.yaml new file mode 100644 index 0000000000..08e71366d1 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_moral_scenarios_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_moral_scenarios_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: moral_scenarios +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_nutrition_light.yaml 
b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_nutrition_light.yaml new file mode 100644 index 0000000000..7987f5f36c --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_nutrition_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_nutrition_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: nutrition +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_philosophy_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_philosophy_light.yaml new file mode 100644 index 0000000000..85ebdd7a4f --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_philosophy_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_philosophy_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: philosophy +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git 
a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_prehistory_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_prehistory_light.yaml new file mode 100644 index 0000000000..24aa8e22fe --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_prehistory_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_prehistory_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: prehistory +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_professional_accounting_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_professional_accounting_light.yaml new file mode 100644 index 0000000000..1dc009663c --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_professional_accounting_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_professional_accounting_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: professional_accounting +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: 
first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_professional_law_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_professional_law_light.yaml new file mode 100644 index 0000000000..6e8c3617db --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_professional_law_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_professional_law_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: professional_law +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_professional_medicine_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_professional_medicine_light.yaml new file mode 100644 index 0000000000..b90cdb38d8 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_professional_medicine_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_professional_medicine_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: professional_medicine +output_type: multiple_choice +training_split: null 
+validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_professional_psychology_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_professional_psychology_light.yaml new file mode 100644 index 0000000000..420a536243 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_professional_psychology_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_professional_psychology_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: professional_psychology +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_public_relations_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_public_relations_light.yaml new file mode 100644 index 0000000000..83d267bc08 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_public_relations_light.yaml 
@@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_public_relations_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: public_relations +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_security_studies_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_security_studies_light.yaml new file mode 100644 index 0000000000..03e05d66e7 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_security_studies_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_security_studies_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: security_studies +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_sociology_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_sociology_light.yaml new file mode 100644 index 
0000000000..7deb088396 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_sociology_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_sociology_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: sociology +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_us_foreign_policy_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_us_foreign_policy_light.yaml new file mode 100644 index 0000000000..6c5f40a55e --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_us_foreign_policy_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_us_foreign_policy_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: us_foreign_policy +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_virology_light.yaml 
b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_virology_light.yaml new file mode 100644 index 0000000000..5ee4a7c95b --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_virology_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_virology_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: virology +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_world_religions_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_world_religions_light.yaml new file mode 100644 index 0000000000..57b13f05b8 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mmlu_light/arabic_leaderboard_arabic_mmlu_world_religions_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_arabic_mmlu_world_religions_light +dataset_path: arcee-globe/Arabic_MMLU-10percent +dataset_name: world_religions +output_type: multiple_choice +training_split: null +validation_split: dev +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: dev +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff 
# fmt: off
LETTER_INDICES_AR = ["أ", "ب", "ج", "د", "هـ", "و", "ز", "ح", "ط", "ي", "ك", "ل", "م", "ن", "س", "ع", "ف", "ص", "ق", "ر", "ش", "ت", "ث", "خ", "ذ", "ض", "ظ", "غ"]
# fmt: on


# fmt: off
LETTER_INDICES = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"]
# fmt: on


def process_docs(dataset: datasets.Dataset):
    """Turn each Arabic-MMLU row into ``query`` / ``choices`` / ``gold``.

    Every row is read for ``subject``, ``question``, the four option columns
    ``A``-``D`` and ``answer``.  ``answer`` is looked up in ``LETTER_INDICES``
    (``list.index`` raises ``ValueError`` when it is not one of those letters)
    and the resulting position is returned as ``gold``.  The options are shown
    in the prompt behind the first four Arabic labels, and those four labels
    are what is returned as ``choices``.

    Returns whatever ``dataset.map`` returns for the per-row converter.
    """

    def _process_doc(doc):
        subject = doc["subject"]
        header = f"الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح حول {subject.replace('_', ' ')}. \n\n"
        options = (doc["A"], doc["B"], doc["C"], doc["D"])
        # The gold letter is Latin; its position is reused for the Arabic labels.
        gold = LETTER_INDICES.index(doc["answer"])

        option_lines = [
            f"{label}. {option}\n"
            for label, option in zip(LETTER_INDICES_AR[:4], options)
        ]
        prompt = f"{header}{doc['question']}\n" + "".join(option_lines) + "الإجابة:"

        return {"query": prompt, "choices": LETTER_INDICES_AR[:4], "gold": gold}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Rewrite every row into the ``query`` / ``choices`` / ``gold`` fields.

    Each row must provide ``query`` (the question text) and ``label``
    (converted with ``int``).  Every other column except ``__few_shots`` is
    taken, in column order, as one answer option.

    Returns whatever ``dataset.map`` returns for the per-row converter.
    """
    reserved_columns = ("query", "label", "__few_shots")
    preamble = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"

    def _process_doc(doc):
        stem = doc["query"]
        gold = int(doc["label"])
        # Whatever is left after dropping the reserved columns is an option.
        options = [doc[column] for column in doc if column not in reserved_columns]

        # Options are numbered from zero, one per line, ahead of the answer cue.
        numbered = "".join(
            f"{position}) {option}\n" for position, option in enumerate(options)
        )
        prompt = f"{preamble}السؤال: {stem}\n{numbered}الإجابة:"

        return {"query": prompt, "choices": options, "gold": gold}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Rewrite every row into the ``query`` / ``choices`` / ``gold`` fields.

    Each row must provide ``query`` (the question text) and ``label``
    (converted with ``int``).  Every other column except ``__few_shots`` is
    taken, in column order, as one answer option.

    Returns whatever ``dataset.map`` returns for the per-row converter.
    """
    reserved_columns = ("query", "label", "__few_shots")
    preamble = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"

    def _process_doc(doc):
        stem = doc["query"]
        gold = int(doc["label"])
        # Whatever is left after dropping the reserved columns is an option.
        options = [doc[column] for column in doc if column not in reserved_columns]

        # Options are numbered from zero, one per line, ahead of the answer cue.
        numbered = "".join(
            f"{position}) {option}\n" for position, option in enumerate(options)
        )
        prompt = f"{preamble}السؤال: {stem}\n{numbered}الإجابة:"

        return {"query": prompt, "choices": options, "gold": gold}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Turn each BoolQ row into a yes/no multiple-choice document.

    Each row is read for ``question``, ``passage`` and ``answer``.  The two
    choices are always "نعم" then "لا"; ``gold`` is 0 when ``answer`` is
    truthy and 1 otherwise.

    Returns whatever ``dataset.map`` returns for the per-row converter.
    """

    def _process_doc(doc):
        question = doc["question"]
        passage = doc["passage"]
        instruction = "بناء على المقطع التالي، أجب عن السؤال ب نعم أو لا"
        # NOTE(review): the continuation lines of this literal carry their
        # source indentation into the prompt, and the prompt ends with a
        # newline plus that indentation - confirm the 8-space indent against
        # the upstream file before relying on exact prompt bytes.
        query = f"""{instruction}
        المقطع :
        {passage}
        السؤال:
        {question}
        الإجابة:
        """

        # Index 0 is the "yes" option, index 1 the "no" option.
        gold = int(not doc["answer"])
        return {"query": query, "choices": ["نعم", "لا"], "gold": gold}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Turn each COPA row into ``query`` / ``choices`` / ``gold``.

    Each row is read for ``premise``, ``choice1``, ``choice2``, ``question``
    and ``label``.  ``question`` must be ``"cause"`` or ``"effect"`` (any
    other value raises ``KeyError``) and selects the Arabic connective placed
    after the premise.  ``label`` is passed through unchanged as ``gold``.

    Returns whatever ``dataset.map`` returns for the per-row converter.
    """
    connectives = {"cause": "لأن", "effect": "لذلك"}

    def _process_doc(doc):
        premise = doc["premise"]
        first, second = doc["choice1"], doc["choice2"]
        connective = connectives[doc["question"]]
        gold = doc["label"]

        # Premise, connective, then the two numbered alternatives and the cue.
        prompt = f"{premise}، {connective} :\n0) {first}\n1) {second}\nالإجابة:"

        return {"query": prompt, "choices": [first, second], "gold": gold}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Turn each HellaSwag row into ``query`` / ``choices`` / ``gold``.

    Each row is read for ``ctx``, ``endings`` and ``label``.  Bracketed spans
    (``[...]``) are stripped from the context and from every ending.
    ``endings`` is stored as the string representation of a list; it is parsed
    with ``ast.literal_eval`` so that dataset text is only ever interpreted as
    a Python literal and never executed.  A value that is already a sequence
    is used as is.  ``label`` is passed through unchanged as ``gold``.

    Raises:
        ValueError, SyntaxError: if ``endings`` is a string that is not a
            valid Python literal.

    Returns whatever ``dataset.map`` returns for the per-row converter.
    """
    import ast  # local import: the only name this block adds to the module

    # Compiled once instead of on every call to re.sub.
    bracketed = re.compile(r"\[.*?\]")
    instruction = (
        "بناء على السياق التالي، اختر النهاية الصحيحة من الاقتراحات التالية"
    )

    def _parse_endings(raw):
        """Return the endings as a list, parsing a stored literal safely."""
        if isinstance(raw, str):
            raw = ast.literal_eval(raw)
        return list(raw)

    def _process_doc(doc):
        ctx = bracketed.sub("", doc["ctx"])  # Remove latin words within brackets
        endings = [bracketed.sub("", e) for e in _parse_endings(doc["endings"])]
        answer_index = doc["label"]

        # NOTE(review): the continuation lines of this literal carry their
        # source indentation into the prompt - confirm the 8-space indent
        # against the upstream file before relying on exact prompt bytes.
        query = f"""{instruction}
        السياق:
        {ctx}
        الاقتراحات:

        """
        query += "".join(f"{i}) {ending}\n" for i, ending in enumerate(endings))
        query += "الإجابة:"

        return {"query": query, "choices": endings, "gold": answer_index}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Rewrite every row into the ``query`` / ``choices`` / ``gold`` fields.

    Each row must provide ``query`` (the question text) and ``label``
    (converted with ``int``).  Every other column except ``__few_shots`` is
    taken, in column order, as one answer option.

    Returns whatever ``dataset.map`` returns for the per-row converter.
    """
    reserved_columns = ("query", "label", "__few_shots")
    preamble = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"

    def _process_doc(doc):
        stem = doc["query"]
        gold = int(doc["label"])
        # Whatever is left after dropping the reserved columns is an option.
        options = [doc[column] for column in doc if column not in reserved_columns]

        # Options are numbered from zero, one per line, ahead of the answer cue.
        numbered = "".join(
            f"{position}) {option}\n" for position, option in enumerate(options)
        )
        prompt = f"{preamble}السؤال: {stem}\n{numbered}الإجابة:"

        return {"query": prompt, "choices": options, "gold": gold}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Rewrite every row into the ``query`` / ``choices`` / ``gold`` fields.

    Each row must provide ``query`` (the question text) and ``label``
    (converted with ``int``).  Every other column except ``__few_shots`` is
    taken, in column order, as one answer option.

    Returns whatever ``dataset.map`` returns for the per-row converter.
    """
    reserved_columns = ("query", "label", "__few_shots")
    preamble = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"

    def _process_doc(doc):
        stem = doc["query"]
        gold = int(doc["label"])
        # Whatever is left after dropping the reserved columns is an option.
        options = [doc[column] for column in doc if column not in reserved_columns]

        # Options are numbered from zero, one per line, ahead of the answer cue.
        numbered = "".join(
            f"{position}) {option}\n" for position, option in enumerate(options)
        )
        prompt = f"{preamble}السؤال: {stem}\n{numbered}الإجابة:"

        return {"query": prompt, "choices": options, "gold": gold}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Rewrite every row into the ``query`` / ``choices`` / ``gold`` fields.

    Each row must provide ``query`` (the question text) and ``label``
    (converted with ``int``).  Every other column except ``__few_shots`` is
    taken, in column order, as one answer option.

    Returns whatever ``dataset.map`` returns for the per-row converter.
    """
    reserved_columns = ("query", "label", "__few_shots")
    preamble = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"

    def _process_doc(doc):
        stem = doc["query"]
        gold = int(doc["label"])
        # Whatever is left after dropping the reserved columns is an option.
        options = [doc[column] for column in doc if column not in reserved_columns]

        # Options are numbered from zero, one per line, ahead of the answer cue.
        numbered = "".join(
            f"{position}) {option}\n" for position, option in enumerate(options)
        )
        prompt = f"{preamble}السؤال: {stem}\n{numbered}الإجابة:"

        return {"query": prompt, "choices": options, "gold": gold}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset):
    """Rewrite every row into the ``query`` / ``choices`` / ``gold`` fields.

    Each row must provide ``query`` (the question text) and ``label``
    (converted with ``int``).  Every other column except ``__few_shots`` is
    taken, in column order, as one answer option.

    Returns whatever ``dataset.map`` returns for the per-row converter.
    """
    reserved_columns = ("query", "label", "__few_shots")
    preamble = "الأسئلة التالية هي أسئلة متعددة الإختيارات مع الجواب الصحيح\n\n"

    def _process_doc(doc):
        stem = doc["query"]
        gold = int(doc["label"])
        # Whatever is left after dropping the reserved columns is an option.
        options = [doc[column] for column in doc if column not in reserved_columns]

        # Options are numbered from zero, one per line, ahead of the answer cue.
        numbered = "".join(
            f"{position}) {option}\n" for position, option in enumerate(options)
        )
        prompt = f"{preamble}السؤال: {stem}\n{numbered}الإجابة:"

        return {"query": prompt, "choices": options, "gold": gold}

    return dataset.map(_process_doc)
--git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_sciq_light/arabic_leaderboard_arabic_mt_sciq_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_sciq_light/arabic_leaderboard_arabic_mt_sciq_light.yaml new file mode 100644 index 0000000000..13127e9915 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_sciq_light/arabic_leaderboard_arabic_mt_sciq_light.yaml @@ -0,0 +1,13 @@ +group: arabic_leaderboard_arabic_mt_sciq_light +task: + - arabic_mt_sciq_light + +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_sciq_light/arabic_mt_sciq_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_sciq_light/arabic_mt_sciq_light.yaml new file mode 100644 index 0000000000..95976cbb7f --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_sciq_light/arabic_mt_sciq_light.yaml @@ -0,0 +1,23 @@ +task: arabic_mt_sciq_light +dataset_path: arcee-globe/AlGhafa-Arabic-LLM-Benchmark-Translated-10percent +dataset_name: sciq_ar +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_sciq_light/utils.py b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_arabic_mt_sciq_light/utils.py new file mode 100644 index 0000000000..ddb42eeb8c --- /dev/null +++ 
def doc_to_text(doc):
    """Render the Arabic SciQ prompt for one document.

    Args:
        doc: Mapping with a ``support`` entry (context passage) and a
            ``question`` entry.

    Returns:
        The prompt string: instruction, context, question and a trailing
        "possible answers" header.
    """
    # "Based on the context below, choose the correct answer to the following
    # question from the list of suggestions"
    instruction = (
        "بناءً على السياق أدناه، اختر الإجابة الصحيحة للسؤال التالي من قائمة الاقتراحات"
    )
    support = doc["support"]
    question = doc["question"]
    # Section labels are "Context:", "Question:" and "Possible answers:".
    # Whitespace inside the triple-quoted literal is emitted verbatim, so the
    # layout of these lines is part of the prompt text.
    query = f"""{instruction}
    السياق:
    {support}
    السؤال:
    {question}
    الإجابات المحتملة:

    """
    return query


def process_docs(dataset: datasets.Dataset):
    """Convert SciQ rows into shuffled multiple-choice items.

    Args:
        dataset: Split whose rows contain ``support``, ``question``,
            ``correct_answer`` and ``distractor1`` .. ``distractor3``.

    Returns:
        The result of ``dataset.map``; each row yields ``query`` (the prompt
        from :func:`doc_to_text`), ``choices`` (the four answers in shuffled
        order) and ``gold`` (position of the correct answer in ``choices``).
    """

    def _process_doc(doc):
        correct_answer = doc["correct_answer"]
        choices = [
            doc["distractor1"],
            doc["distractor2"],
            doc["distractor3"],
            correct_answer,
        ]

        # Shuffle with a private generator seeded from the document itself.
        # The module-level RNG would make the order (and therefore ``gold``)
        # depend on whatever consumed random numbers earlier in the process,
        # and would advance global RNG state as a side effect of
        # preprocessing. String seeds are hashed deterministically by
        # ``random.Random``, so the order is stable across runs and workers.
        rng = random.Random(f"{doc['question']}\x1f{correct_answer}")
        rng.shuffle(choices)

        # First occurrence is enough: if a distractor duplicates the correct
        # answer, both positions hold the same answer text.
        answer_index = choices.index(correct_answer)

        return {"query": doc_to_text(doc), "choices": choices, "gold": answer_index}

    return dataset.map(_process_doc)
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Convert ToxiGen rows into yes/no hate-speech classification items.

    Args:
        dataset: Split whose rows contain ``text`` plus the numeric columns
            ``toxicity_ai`` and ``toxicity_human``.

    Returns:
        The result of ``dataset.map``; each row yields ``query`` (the prompt),
        ``choices`` (``["لا", "نعم"]``, i.e. no / yes) and ``gold`` (1 when
        the two toxicity scores sum to more than 5.5, otherwise 0).
    """

    def _process_doc(doc):
        statement = doc["text"]
        # A row counts as hateful only when the combined AI + human score is
        # strictly above 5.5; ``gold`` then indexes "نعم" (yes) in ``choices``.
        combined_score = doc["toxicity_ai"] + doc["toxicity_human"]
        gold = int(combined_score > 5.5)
        # 'Does the following statement incite hatred? Reply with "yes" or "no".'
        question = 'هل العبارة التالية تحض على الكراهية؟ قم بالرد بـ"نعم" أو "لا".'

        # Labels are "The statement:" and "Answer:". Whitespace inside the
        # triple-quoted literal is emitted verbatim as part of the prompt.
        prompt = f"""{question}
        العبارة:
        '{statement}'
        الإجابة:
        """
        return {
            "query": prompt,
            "choices": ["لا", "نعم"],
            "gold": gold,
        }

    return dataset.map(_process_doc)
"choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arab_Empire_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arab_Empire_light.yaml new file mode 100644 index 0000000000..886574ebf2 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arab_Empire_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arab_Empire_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arab_Empire +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Architecture_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Architecture_light.yaml new file mode 100644 index 0000000000..e57472ad6e --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Architecture_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Architecture_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Architecture +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function 
utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Art_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Art_light.yaml new file mode 100644 index 0000000000..e94340e755 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Art_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Art_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Art +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Astronomy_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Astronomy_light.yaml new file mode 100644 index 0000000000..e8ed990d52 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Astronomy_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Astronomy_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Astronomy +output_type: multiple_choice +training_split: null 
+validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Calligraphy_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Calligraphy_light.yaml new file mode 100644 index 0000000000..cd41bdde6a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Calligraphy_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Calligraphy_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Calligraphy +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Ceremony_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Ceremony_light.yaml new file mode 100644 index 0000000000..72c6705479 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Ceremony_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Ceremony_light +dataset_path: 
arcee-globe/ACVA-10percent +dataset_name: Arabic_Ceremony +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Clothing_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Clothing_light.yaml new file mode 100644 index 0000000000..9348de07f7 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Clothing_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Clothing_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Clothing +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Culture_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Culture_light.yaml new file mode 100644 index 0000000000..4f211064d6 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Culture_light.yaml @@ 
-0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Culture_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Culture +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Food_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Food_light.yaml new file mode 100644 index 0000000000..7ccef6746f --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Food_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Food_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Food +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Funeral_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Funeral_light.yaml new file mode 100644 index 0000000000..941154787b --- /dev/null +++ 
b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Funeral_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Funeral_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Funeral +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Geography_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Geography_light.yaml new file mode 100644 index 0000000000..36221d8899 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Geography_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Geography_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Geography +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_History_light.yaml 
b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_History_light.yaml new file mode 100644 index 0000000000..2e12831816 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_History_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_History_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_History +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Language_Origin_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Language_Origin_light.yaml new file mode 100644 index 0000000000..8060604355 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Language_Origin_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Language_Origin_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Language_Origin +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git 
a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Literature_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Literature_light.yaml new file mode 100644 index 0000000000..3122e39531 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Literature_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Literature_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Literature +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Math_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Math_light.yaml new file mode 100644 index 0000000000..0182aedac7 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Math_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Math_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Math +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + 
aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Medicine_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Medicine_light.yaml new file mode 100644 index 0000000000..aec88febf1 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Medicine_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Medicine_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Medicine +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Music_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Music_light.yaml new file mode 100644 index 0000000000..35a771898a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Music_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Music_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Music +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc 
+ aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Ornament_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Ornament_light.yaml new file mode 100644 index 0000000000..6b31186cd6 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Ornament_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Ornament_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Ornament +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Philosophy_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Philosophy_light.yaml new file mode 100644 index 0000000000..f6b5fa71f1 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Philosophy_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Philosophy_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Philosophy +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" 
+fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Physics_and_Chemistry_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Physics_and_Chemistry_light.yaml new file mode 100644 index 0000000000..559d729c9b --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Physics_and_Chemistry_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Physics_and_Chemistry_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Physics_and_Chemistry +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Wedding_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Wedding_light.yaml new file mode 100644 index 0000000000..9241709c13 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Arabic_Wedding_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Arabic_Wedding_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Arabic_Wedding +output_type: multiple_choice +training_split: null +validation_split: 
validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Bahrain_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Bahrain_light.yaml new file mode 100644 index 0000000000..b9c7cef57a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Bahrain_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Bahrain_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Bahrain +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Comoros_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Comoros_light.yaml new file mode 100644 index 0000000000..1f74bd46c5 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Comoros_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Comoros_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Comoros +output_type: multiple_choice +training_split: null 
+validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Egypt_modern_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Egypt_modern_light.yaml new file mode 100644 index 0000000000..e0b19cff58 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Egypt_modern_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Egypt_modern_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Egypt_modern +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromAncientEgypt_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromAncientEgypt_light.yaml new file mode 100644 index 0000000000..6cf755a2a8 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromAncientEgypt_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_InfluenceFromAncientEgypt_light +dataset_path: 
arcee-globe/ACVA-10percent +dataset_name: InfluenceFromAncientEgypt +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromByzantium_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromByzantium_light.yaml new file mode 100644 index 0000000000..8fe285eb12 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromByzantium_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_InfluenceFromByzantium_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: InfluenceFromByzantium +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromChina_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromChina_light.yaml new file mode 100644 index 0000000000..bb028b0892 --- /dev/null +++ 
b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromChina_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_InfluenceFromChina_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: InfluenceFromChina +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromGreece_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromGreece_light.yaml new file mode 100644 index 0000000000..25060acc1a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromGreece_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_InfluenceFromGreece_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: InfluenceFromGreece +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromIslam_light.yaml 
b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromIslam_light.yaml new file mode 100644 index 0000000000..0a60a2f3f0 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromIslam_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_InfluenceFromIslam_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: InfluenceFromIslam +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromPersia_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromPersia_light.yaml new file mode 100644 index 0000000000..7081bec227 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromPersia_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_InfluenceFromPersia_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: InfluenceFromPersia +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git 
a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromRome_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromRome_light.yaml new file mode 100644 index 0000000000..8c64cf3bbe --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_InfluenceFromRome_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_InfluenceFromRome_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: InfluenceFromRome +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Iraq_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Iraq_light.yaml new file mode 100644 index 0000000000..a056a9cf04 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Iraq_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Iraq_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Iraq +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true 
+metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Islam_Education_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Islam_Education_light.yaml new file mode 100644 index 0000000000..e8f6ad45d9 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Islam_Education_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Islam_Education_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Islam_Education +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Islam_branches_and_schools_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Islam_branches_and_schools_light.yaml new file mode 100644 index 0000000000..98137e9a3a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Islam_branches_and_schools_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Islam_branches_and_schools_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Islam_branches_and_schools +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n 
+metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Islamic_law_system_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Islamic_law_system_light.yaml new file mode 100644 index 0000000000..d9aff345da --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Islamic_law_system_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Islamic_law_system_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Islamic_law_system +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Jordan_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Jordan_light.yaml new file mode 100644 index 0000000000..674a998e01 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Jordan_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Jordan_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Jordan +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" 
+fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Kuwait_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Kuwait_light.yaml new file mode 100644 index 0000000000..0c3d372d9e --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Kuwait_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Kuwait_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Kuwait +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Lebanon_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Lebanon_light.yaml new file mode 100644 index 0000000000..9c3856d698 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Lebanon_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Lebanon_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Lebanon +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" 
+fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Libya_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Libya_light.yaml new file mode 100644 index 0000000000..6070ccbfb8 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Libya_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Libya_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Libya +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Mauritania_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Mauritania_light.yaml new file mode 100644 index 0000000000..0b1deda614 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Mauritania_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Mauritania_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Mauritania +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: 
"choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Mesopotamia_civilization_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Mesopotamia_civilization_light.yaml new file mode 100644 index 0000000000..65474b724b --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Mesopotamia_civilization_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Mesopotamia_civilization_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Mesopotamia_civilization +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Morocco_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Morocco_light.yaml new file mode 100644 index 0000000000..d752434a5a --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Morocco_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Morocco_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Morocco +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function 
utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Oman_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Oman_light.yaml new file mode 100644 index 0000000000..448498f4a1 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Oman_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Oman_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Oman +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Palestine_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Palestine_light.yaml new file mode 100644 index 0000000000..a619c460a1 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Palestine_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Palestine_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Palestine +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function 
utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Qatar_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Qatar_light.yaml new file mode 100644 index 0000000000..967dbc57ef --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Qatar_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Qatar_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Qatar +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Saudi_Arabia_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Saudi_Arabia_light.yaml new file mode 100644 index 0000000000..d45558b9ff --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Saudi_Arabia_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Saudi_Arabia_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Saudi_Arabia +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test 
+process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Somalia_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Somalia_light.yaml new file mode 100644 index 0000000000..558ea176a3 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Somalia_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Somalia_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Somalia +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Sudan_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Sudan_light.yaml new file mode 100644 index 0000000000..ce59973306 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Sudan_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Sudan_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Sudan +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test 
+process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Syria_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Syria_light.yaml new file mode 100644 index 0000000000..8b0bd7aebc --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Syria_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Syria_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Syria +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Tunisia_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Tunisia_light.yaml new file mode 100644 index 0000000000..a53c5e0bf9 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Tunisia_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Tunisia_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Tunisia +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test 
+process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_United_Arab_Emirates_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_United_Arab_Emirates_light.yaml new file mode 100644 index 0000000000..1ce5993a67 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_United_Arab_Emirates_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_United_Arab_Emirates_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: United_Arab_Emirates +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Yemen_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Yemen_light.yaml new file mode 100644 index 0000000000..e480b19d60 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_Yemen_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_Yemen_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: Yemen +output_type: multiple_choice +training_split: null 
+validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_communication_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_communication_light.yaml new file mode 100644 index 0000000000..2814278ace --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_communication_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_communication_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: communication +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_computer_and_phone_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_computer_and_phone_light.yaml new file mode 100644 index 0000000000..ddd07e3f50 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_computer_and_phone_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_computer_and_phone_light +dataset_path: 
arcee-globe/ACVA-10percent +dataset_name: computer_and_phone +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_daily_life_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_daily_life_light.yaml new file mode 100644 index 0000000000..2d975e4e85 --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_daily_life_light.yaml @@ -0,0 +1,23 @@ +task: arabic_leaderboard_acva_daily_life_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: daily_life +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_entertainment_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_entertainment_light.yaml new file mode 100644 index 0000000000..721e6cdd3b --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_entertainment_light.yaml @@ -0,0 +1,23 @@ +task: 
arabic_leaderboard_acva_entertainment_light +dataset_path: arcee-globe/ACVA-10percent +dataset_name: entertainment +output_type: multiple_choice +training_split: null +validation_split: validation +test_split: test +process_docs: !function utils.process_docs +doc_to_text: "{{query}}" +doc_to_target: "{{gold}}" +doc_to_choice: "choices" +fewshot_split: validation +fewshot_config: + sampler: first_n +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_light.yaml b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_light.yaml new file mode 100644 index 0000000000..ea4a89771f --- /dev/null +++ b/lm_eval/tasks/arabic_leaderboard_light/arabic_leaderboard_avca_light/arabic_leaderboard_acva_light.yaml @@ -0,0 +1,70 @@ +group: arabic_leaderboard_acva_light +task: + - arabic_leaderboard_acva_Algeria_light + - arabic_leaderboard_acva_Ancient_Egypt_light + - arabic_leaderboard_acva_Arab_Empire_light + - arabic_leaderboard_acva_Arabic_Architecture_light + - arabic_leaderboard_acva_Arabic_Art_light + - arabic_leaderboard_acva_Arabic_Astronomy_light + - arabic_leaderboard_acva_Arabic_Calligraphy_light + - arabic_leaderboard_acva_Arabic_Ceremony_light + - arabic_leaderboard_acva_Arabic_Clothing_light + - arabic_leaderboard_acva_Arabic_Culture_light + - arabic_leaderboard_acva_Arabic_Food_light + - arabic_leaderboard_acva_Arabic_Funeral_light + - arabic_leaderboard_acva_Arabic_Geography_light + - arabic_leaderboard_acva_Arabic_History_light + - arabic_leaderboard_acva_Arabic_Language_Origin_light + - arabic_leaderboard_acva_Arabic_Literature_light + - arabic_leaderboard_acva_Arabic_Math_light + - arabic_leaderboard_acva_Arabic_Medicine_light + - arabic_leaderboard_acva_Arabic_Music_light + - 
arabic_leaderboard_acva_Arabic_Ornament_light + - arabic_leaderboard_acva_Arabic_Philosophy_light + - arabic_leaderboard_acva_Arabic_Physics_and_Chemistry_light + - arabic_leaderboard_acva_Arabic_Wedding_light + - arabic_leaderboard_acva_Bahrain_light + - arabic_leaderboard_acva_Comoros_light + - arabic_leaderboard_acva_Egypt_modern_light + - arabic_leaderboard_acva_InfluenceFromAncientEgypt_light + - arabic_leaderboard_acva_InfluenceFromByzantium_light + - arabic_leaderboard_acva_InfluenceFromChina_light + - arabic_leaderboard_acva_InfluenceFromGreece_light + - arabic_leaderboard_acva_InfluenceFromIslam_light + - arabic_leaderboard_acva_InfluenceFromPersia_light + - arabic_leaderboard_acva_InfluenceFromRome_light + - arabic_leaderboard_acva_Iraq_light + - arabic_leaderboard_acva_Islam_Education_light + - arabic_leaderboard_acva_Islam_branches_and_schools_light + - arabic_leaderboard_acva_Islamic_law_system_light + - arabic_leaderboard_acva_Jordan_light + - arabic_leaderboard_acva_Kuwait_light + - arabic_leaderboard_acva_Lebanon_light + - arabic_leaderboard_acva_Libya_light + - arabic_leaderboard_acva_Mauritania_light + - arabic_leaderboard_acva_Mesopotamia_civilization_light + - arabic_leaderboard_acva_Morocco_light + - arabic_leaderboard_acva_Oman_light + - arabic_leaderboard_acva_Palestine_light + - arabic_leaderboard_acva_Qatar_light + - arabic_leaderboard_acva_Saudi_Arabia_light + - arabic_leaderboard_acva_Somalia_light + - arabic_leaderboard_acva_Sudan_light + - arabic_leaderboard_acva_Syria_light + - arabic_leaderboard_acva_Tunisia_light + - arabic_leaderboard_acva_United_Arab_Emirates_light + - arabic_leaderboard_acva_Yemen_light + - arabic_leaderboard_acva_communication_light + - arabic_leaderboard_acva_computer_and_phone_light + - arabic_leaderboard_acva_daily_life_light + - arabic_leaderboard_acva_entertainment_light + +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true + - metric: acc_norm + aggregation: mean + 
def process_docs(dataset: datasets.Dataset):
    """Reshape raw ACVA rows into the fields the task templates read.

    Every row is replaced by a mapping with three keys:

    * ``query``   - the row's ``question`` wrapped in the Arabic
      "question / answer" prompt.
    * ``choices`` - the two answer labels, in a fixed order.
    * ``gold``    - position of the row's ``answer`` within ``choices``.

    Args:
        dataset: Object whose ``map`` method is called with the per-row
            converter; each row is indexed by ``"question"`` and
            ``"answer"``.

    Returns:
        The result of ``dataset.map`` applied with the per-row converter.

    Raises:
        ValueError: Raised by ``list.index`` inside the converter when a
            row's ``answer`` is neither of the two labels.
    """

    def _to_example(row):
        # Read both source fields up front, question first.
        prompt_body = row["question"]
        label = row["answer"]

        # A fresh list per row, so no two output rows share the same object.
        options = ["صح", "خطأ"]

        example = {}
        example["query"] = f"السؤال: {prompt_body}\nالإجابة:"
        example["choices"] = options
        # The gold target is an index into the choice list, not the label text.
        example["gold"] = options.index(label)
        return example

    return dataset.map(_to_example)