diff --git a/src/screens/ARB/Arb.tsx b/src/screens/ARB/Arb.tsx
index 0a5670d..ef7581d 100644
--- a/src/screens/ARB/Arb.tsx
+++ b/src/screens/ARB/Arb.tsx
@@ -285,7 +285,7 @@ const Arb: React.FC = () => {
           Evaluation Results
-          Our evaluation of current large language models (LLMs) focuses on text-only problems, with no multimodal tasks, using models including ChatGPT, GPT 3.5, GPT-4, and Claude. Each question type is assessed with task-specific instructions and chain of thought; for multiple-choice questions, the model's choice is compared with the correct answer, while numerical, symbolic, and proof-like problems require extraction and parsing of the model's answer, often requiring mathematical libraries and manual grading due to their complexity. We also tested two model-based approaches for grading, including GPT-4's ability to grade equivalence of two symbolic expressions and a rubric-based evaluation method, which showed promising results, facilitating the evaluation of increasingly unstructured answers.
+          Our evaluation of current large language models (LLMs) focuses on text-only problems, with no multimodal tasks, using models including ChatGPT, GPT-3.5, GPT-4, and Claude. Each question type is assessed with task-specific instructions and chain-of-thought prompting; for multiple-choice questions, the model's choice is compared with the correct answer, while numerical, symbolic, and proof-like problems require extracting and parsing the model's answer, often with the help of mathematical libraries and manual grading due to their complexity. We also tested two model-based grading approaches: GPT-4's ability to judge the equivalence of two symbolic expressions and a rubric-based evaluation method, both of which showed promising results and facilitate the evaluation of increasingly unstructured answers.
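
The copy above mentions that symbolic answers are checked against the reference with the help of mathematical libraries. As a hypothetical illustration only (not code from this repository or from the ARB evaluation itself), a minimal sketch of such a check in TypeScript, assuming the mathjs library and an answer string already extracted from the model's output, might look like this:

```ts
// Hypothetical sketch of a symbolic-equivalence check, assuming mathjs is installed.
import { simplify } from "mathjs";

/**
 * Heuristically decides whether two symbolic expressions are equivalent by
 * checking whether their difference simplifies to zero. Expressions that fail
 * to parse or simplify would be routed to manual (or model-based) grading.
 */
function symbolicallyEquivalent(modelAnswer: string, reference: string): boolean {
  try {
    return simplify(`(${modelAnswer}) - (${reference})`).toString() === "0";
  } catch {
    return false; // Unparseable answer: fall back to manual grading.
  }
}

// Example: an extracted model answer compared with the reference answer.
console.log(symbolicallyEquivalent("2*x + x", "3*x")); // true: difference simplifies to 0
console.log(symbolicallyEquivalent("x + 1", "x + 2")); // false
```

A check like this only covers what the library can simplify algebraically; anything it misses is exactly the kind of unstructured answer the paragraph says was handled by manual grading or the GPT-4-based equivalence and rubric evaluations.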