diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md index 518975a9c1..6c500ee5da 100644 --- a/lm_eval/tasks/README.md +++ b/lm_eval/tasks/README.md @@ -69,7 +69,7 @@ | [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu | | [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English | | mmlu | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English | -| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigourous. | English | +| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigorous. | English | | model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | | | [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English | | [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English | diff --git a/lm_eval/tasks/__init__.py b/lm_eval/tasks/__init__.py index b892717956..4cae685284 100644 --- a/lm_eval/tasks/__init__.py +++ b/lm_eval/tasks/__init__.py @@ -492,7 +492,7 @@ def _get_task_and_group(self, task_dir: str): "`group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. " "`tag` will be used to allow to call a collection of tasks just like `group`. " "`group` will be removed in order to not cause confusion with the new ConfigurableGroup " - "which will be the offical way to create groups with addition of group-wide configuations." + "which will be the official way to create groups with addition of group-wide configurations." 
) print_info = False # attr = "tag" diff --git a/lm_eval/tasks/aclue/README.md b/lm_eval/tasks/aclue/README.md index 5e218e599d..6323ef7e3e 100644 --- a/lm_eval/tasks/aclue/README.md +++ b/lm_eval/tasks/aclue/README.md @@ -14,7 +14,7 @@ Homepage: https://github.com/isen-zhang/ACLUE ```bibtex @inproceedings{zhang-li-2023-large, - title = "Can Large Langauge Model Comprehend {A}ncient {C}hinese? A Preliminary Test on {ACLUE}", + title = "Can Large Language Model Comprehend {A}ncient {C}hinese? A Preliminary Test on {ACLUE}", author = "Zhang, Yixuan and Li, Haonan", booktitle = "Proceedings of the Ancient Language Processing Workshop", month = sep, diff --git a/lm_eval/tasks/eq_bench/README.md b/lm_eval/tasks/eq_bench/README.md index 472890bdc8..df11108eec 100644 --- a/lm_eval/tasks/eq_bench/README.md +++ b/lm_eval/tasks/eq_bench/README.md @@ -16,8 +16,8 @@ Homepage: https://eqbench.com/ NOTE: There are some key differences between the lm-evaluation-harness version and the implementation described in the EQ-Bench paper (These have been OK'd by the author): - The lm-eval version uses the EQ-Bench v2 test set (171 questions) and score calculation. It does not incorporate the revision part of the prompt, as per v2.1 (https://github.com/EQ-bench/EQ-Bench) -- No retries in lm-eval version (EQ-Bench pipeline retries with successively higher temps if it encounters unparseable answers) -- In the original implementation, unparseable answers are excluded from the final score, and 83% of answers have to be parseable or a fail is returned. The lm-eval version instead assigns 0 to unparsable answers and has no fail criteria. So for lower performing models, there may be differences with the EQ-Bench leaderboard. 
+- No retries in lm-eval version (EQ-Bench pipeline retries with successively higher temps if it encounters unparsable answers) +- In the original implementation, unparsable answers are excluded from the final score, and 83% of answers have to be parseable or a fail is returned. The lm-eval version instead assigns 0 to unparsable answers and has no fail criteria. So for lower performing models, there may be differences with the EQ-Bench leaderboard. ### Citation diff --git a/lm_eval/tasks/ifeval/instructions.py b/lm_eval/tasks/ifeval/instructions.py index 5c352af3f7..a79cbba4f5 100644 --- a/lm_eval/tasks/ifeval/instructions.py +++ b/lm_eval/tasks/ifeval/instructions.py @@ -78,7 +78,7 @@ # The number of highlighted sections. _NUM_HIGHLIGHTED_SECTIONS = 4 -# The section spliter. +# The section splitter. _SECTION_SPLITER = ("Section", "SECTION") # The number of sections. @@ -153,7 +153,7 @@ def build_description(self, *, language=None): return self._description_pattern.format(language=_LANGUAGES[self._language]) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"language": self._language} def get_instruction_args_keys(self): @@ -223,7 +223,7 @@ def build_description(self, *, num_sentences=None, relation=None): ) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return { "num_sentences": self._num_sentences_threshold, "relation": self._comparison_relation, @@ -276,7 +276,7 @@ def build_description(self, *, num_placeholders=None): return self._description_pattern.format(num_placeholders=self._num_placeholders) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"num_placeholders": self._num_placeholders} def get_instruction_args_keys(self): @@ -323,7 +323,7 @@ def 
build_description(self, *, num_bullets=None): return self._description_pattern.format(num_bullets=self._num_bullets) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"num_bullets": self._num_bullets} def get_instruction_args_keys(self): @@ -362,7 +362,7 @@ def build_description(self): ) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return None def get_instruction_args_keys(self): @@ -393,7 +393,7 @@ def build_description(self, *, starter=None): """Build the instruction description. Args: - starter: A string representing the keyward that the response should start + starter: A string representing the keyword that the response should start with. Returns: @@ -409,7 +409,7 @@ def build_description(self, *, starter=None): return self._description_pattern.format(starter=self._starter) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"starter": self._starter} def get_instruction_args_keys(self): @@ -458,7 +458,7 @@ def build_description(self, *, num_highlights=None): return self._description_pattern.format(num_highlights=self._num_highlights) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"num_highlights": self._num_highlights} def get_instruction_args_keys(self): @@ -469,12 +469,12 @@ def check_following(self, value): """Checks if the number of highlighted sections meets the requirement. Args: - value: a string repesenting the response. The response is expected to + value: a string representing the response. The response is expected to contain highlighted sections in the format of *highlighted*. 
Returns: True if the actual number of highlighted sections in the format of - *highlighed sections* meets the minimum requirement; otherwise False. + *highlighted sections* meets the minimum requirement; otherwise False. """ num_highlights = 0 highlights = re.findall(r"\*[^\n\*]*\*", value) @@ -529,7 +529,7 @@ def build_description(self, *, section_spliter=None, num_sections=None): ) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return { "section_spliter": self._section_spliter, "num_sections": self._num_sections, @@ -582,7 +582,7 @@ def build_description(self, *, num_paragraphs=None): return self._description_pattern.format(num_paragraphs=self._num_paragraphs) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"num_paragraphs": self._num_paragraphs} def get_instruction_args_keys(self): @@ -642,7 +642,7 @@ def build_description(self, *, postscript_marker=None): return self._description_pattern.format(postscript=self._postscript_marker) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"postscript_marker": self._postscript_marker} def get_instruction_args_keys(self): @@ -672,7 +672,7 @@ def check_following(self, value): class RephraseChecker(Instruction): - """Checks the repharse.""" + """Checks the rephrase.""" def build_description(self, *, original_message): """Build the instruction description. 
@@ -701,7 +701,7 @@ def build_description(self, *, original_message): return self._description def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"original_message": self._reference_without_change} def get_instruction_args_keys(self): @@ -766,7 +766,7 @@ def build_description(self, *, keywords=None): return self._description_pattern.format(keywords=self._keywords) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"keywords": self._keywords} def get_instruction_args_keys(self): @@ -831,7 +831,7 @@ def build_description(self, *, keyword=None, frequency=None, relation=None): ) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return { "keyword": self._keyword, "frequency": self._frequency, @@ -894,7 +894,7 @@ def build_description(self, *, num_words=None, relation=None): ) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"num_words": self._num_words, "relation": self._comparison_relation} def get_instruction_args_keys(self): @@ -922,7 +922,7 @@ def build_description(self): return self._description_pattern def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return None def get_instruction_args_keys(self): @@ -996,7 +996,7 @@ def build_description( ) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return { "num_paragraphs": self._num_paragraphs, "nth_paragraph": self._nth_paragraph, @@ -1089,7 +1089,7 @@ def build_description(self, key_sentences=None, num_sentences=None): ) def 
get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return { "num_sentences": self._num_sentences, "key_sentences": list(self._key_sentences), @@ -1117,7 +1117,7 @@ def build_description(self, forbidden_words=None): """Build the instruction description. Args: - forbidden_words: A sequences of strings respresenting words that are not + forbidden_words: A sequence of strings representing words that are not allowed in the response. Returns: @@ -1138,7 +1138,7 @@ def build_description(self, forbidden_words=None): return self._description_pattern.format(forbidden_words=self._forbidden_words) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"forbidden_words": self._forbidden_words} def get_instruction_args_keys(self): @@ -1188,7 +1188,7 @@ def build_description(self, *, original_paragraph, low, high): ) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return { "original_paragraph": self._original_paragraph, "low": self._low, @@ -1225,7 +1225,7 @@ def build_description(self): return self._description_pattern def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return None def get_instruction_args_keys(self): diff --git a/lm_eval/tasks/leaderboard/ifeval/instructions.py b/lm_eval/tasks/leaderboard/ifeval/instructions.py index 5c352af3f7..a79cbba4f5 100644 --- a/lm_eval/tasks/leaderboard/ifeval/instructions.py +++ b/lm_eval/tasks/leaderboard/ifeval/instructions.py @@ -78,7 +78,7 @@ # The number of highlighted sections. _NUM_HIGHLIGHTED_SECTIONS = 4 -# The section spliter. +# The section splitter. _SECTION_SPLITER = ("Section", "SECTION") # The number of sections. 
@@ -153,7 +153,7 @@ def build_description(self, *, language=None): return self._description_pattern.format(language=_LANGUAGES[self._language]) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"language": self._language} def get_instruction_args_keys(self): @@ -223,7 +223,7 @@ def build_description(self, *, num_sentences=None, relation=None): ) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return { "num_sentences": self._num_sentences_threshold, "relation": self._comparison_relation, @@ -276,7 +276,7 @@ def build_description(self, *, num_placeholders=None): return self._description_pattern.format(num_placeholders=self._num_placeholders) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"num_placeholders": self._num_placeholders} def get_instruction_args_keys(self): @@ -323,7 +323,7 @@ def build_description(self, *, num_bullets=None): return self._description_pattern.format(num_bullets=self._num_bullets) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"num_bullets": self._num_bullets} def get_instruction_args_keys(self): @@ -362,7 +362,7 @@ def build_description(self): ) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return None def get_instruction_args_keys(self): @@ -393,7 +393,7 @@ def build_description(self, *, starter=None): """Build the instruction description. Args: - starter: A string representing the keyward that the response should start + starter: A string representing the keyword that the response should start with. 
Returns: @@ -409,7 +409,7 @@ def build_description(self, *, starter=None): return self._description_pattern.format(starter=self._starter) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"starter": self._starter} def get_instruction_args_keys(self): @@ -458,7 +458,7 @@ def build_description(self, *, num_highlights=None): return self._description_pattern.format(num_highlights=self._num_highlights) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"num_highlights": self._num_highlights} def get_instruction_args_keys(self): @@ -469,12 +469,12 @@ def check_following(self, value): """Checks if the number of highlighted sections meets the requirement. Args: - value: a string repesenting the response. The response is expected to + value: a string representing the response. The response is expected to contain highlighted sections in the format of *highlighted*. Returns: True if the actual number of highlighted sections in the format of - *highlighed sections* meets the minimum requirement; otherwise False. + *highlighted sections* meets the minimum requirement; otherwise False. 
""" num_highlights = 0 highlights = re.findall(r"\*[^\n\*]*\*", value) @@ -529,7 +529,7 @@ def build_description(self, *, section_spliter=None, num_sections=None): ) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return { "section_spliter": self._section_spliter, "num_sections": self._num_sections, @@ -582,7 +582,7 @@ def build_description(self, *, num_paragraphs=None): return self._description_pattern.format(num_paragraphs=self._num_paragraphs) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"num_paragraphs": self._num_paragraphs} def get_instruction_args_keys(self): @@ -642,7 +642,7 @@ def build_description(self, *, postscript_marker=None): return self._description_pattern.format(postscript=self._postscript_marker) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"postscript_marker": self._postscript_marker} def get_instruction_args_keys(self): @@ -672,7 +672,7 @@ def check_following(self, value): class RephraseChecker(Instruction): - """Checks the repharse.""" + """Checks the rephrase.""" def build_description(self, *, original_message): """Build the instruction description. 
@@ -701,7 +701,7 @@ def build_description(self, *, original_message): return self._description def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"original_message": self._reference_without_change} def get_instruction_args_keys(self): @@ -766,7 +766,7 @@ def build_description(self, *, keywords=None): return self._description_pattern.format(keywords=self._keywords) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"keywords": self._keywords} def get_instruction_args_keys(self): @@ -831,7 +831,7 @@ def build_description(self, *, keyword=None, frequency=None, relation=None): ) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return { "keyword": self._keyword, "frequency": self._frequency, @@ -894,7 +894,7 @@ def build_description(self, *, num_words=None, relation=None): ) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"num_words": self._num_words, "relation": self._comparison_relation} def get_instruction_args_keys(self): @@ -922,7 +922,7 @@ def build_description(self): return self._description_pattern def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return None def get_instruction_args_keys(self): @@ -996,7 +996,7 @@ def build_description( ) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return { "num_paragraphs": self._num_paragraphs, "nth_paragraph": self._nth_paragraph, @@ -1089,7 +1089,7 @@ def build_description(self, key_sentences=None, num_sentences=None): ) def 
get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return { "num_sentences": self._num_sentences, "key_sentences": list(self._key_sentences), @@ -1117,7 +1117,7 @@ def build_description(self, forbidden_words=None): """Build the instruction description. Args: - forbidden_words: A sequences of strings respresenting words that are not + forbidden_words: A sequence of strings representing words that are not allowed in the response. Returns: @@ -1138,7 +1138,7 @@ def build_description(self, forbidden_words=None): return self._description_pattern.format(forbidden_words=self._forbidden_words) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return {"forbidden_words": self._forbidden_words} def get_instruction_args_keys(self): @@ -1188,7 +1188,7 @@ def build_description(self, *, original_paragraph, low, high): ) def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return { "original_paragraph": self._original_paragraph, "low": self._low, @@ -1225,7 +1225,7 @@ def build_description(self): return self._description_pattern def get_instruction_args(self): - """Returns the keyward args of `build_description`.""" + """Returns the keyword args of `build_description`.""" return None def get_instruction_args_keys(self): diff --git a/lm_eval/tasks/scrolls/README.md b/lm_eval/tasks/scrolls/README.md index a90e00f4e7..b1245926e5 100644 --- a/lm_eval/tasks/scrolls/README.md +++ b/lm_eval/tasks/scrolls/README.md @@ -11,7 +11,7 @@ Homepage: https://www.scrolls-benchmark.com/ Since SCROLLS tasks are generally longer than the maximum sequence length of many models, it is possible to create "subset" tasks that contain only those samples whose tokenized length is less than some pre-defined limit. 
For example, to create a subset of "Qasper" that would -be suitable for a model using the GPTNeoX tokenizer and a 4K maximium sequence length: +be suitable for a model using the GPTNeoX tokenizer and a 4K maximum sequence length: ``` class QasperGPTNeoX4K(Qasper): diff --git a/lm_eval/tasks/scrolls/task.py b/lm_eval/tasks/scrolls/task.py index 45656be3e9..ac2fed25ae 100644 --- a/lm_eval/tasks/scrolls/task.py +++ b/lm_eval/tasks/scrolls/task.py @@ -439,7 +439,7 @@ class GovReport(_SCROLLSSummaryTask): Note: The average length of the reference summaries is ~3,000 characters, or ~600 tokens as tokenized by GPT-NeoX. For causal models, - it is recommended to set `max_gen_toks` sufficently large (e.g. 1024) + it is recommended to set `max_gen_toks` sufficiently large (e.g. 1024) to allow a full summary to be generated. """ diff --git a/lm_eval/tasks/tmmluplus/default/_generate_configs.py b/lm_eval/tasks/tmmluplus/default/_generate_configs.py index a3b1271a57..06ef7a710f 100644 --- a/lm_eval/tasks/tmmluplus/default/_generate_configs.py +++ b/lm_eval/tasks/tmmluplus/default/_generate_configs.py @@ -11,7 +11,7 @@ # Copy from https://github.com/iKala/ievals/blob/main/ievals/settings.py -# from TMMLU+ offical example +# from TMMLU+ official example categories = { "STEM": [ "physics",