diff --git a/book/llm/Chapter.ipynb b/book/llm/Chapter.ipynb
index 692e48b..9e3303a 100644
--- a/book/llm/Chapter.ipynb
+++ b/book/llm/Chapter.ipynb
@@ -335,6 +335,85 @@
     "generator = outlines.generate.choice(model, [\"Positive\", \"Negative\"])\n",
     "answer = generator(prompt)"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Evaluating RAG Pipelines with `Ragas`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "How do you evaluate your RAG application?\n",
+    "\n",
+    "Sure, you can manually look over the responses and check whether they are what you want.\n",
+    "\n",
+    "But that approach doesn't scale.\n",
+    "\n",
+    "Instead, use `Ragas` in Python.\n",
+    "\n",
+    "`Ragas` is a library that provides evaluation techniques and metrics for your RAG pipeline, such as Context Precision/Recall, Faithfulness, and Answer Relevancy.\n",
+    "\n",
+    "See below how easy it is to run `Ragas`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install ragas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from getpass import getpass\n",
+    "\n",
+    "from datasets import Dataset\n",
+    "from ragas import evaluate\n",
+    "from ragas.metrics import faithfulness, answer_correctness\n",
+    "\n",
+    "# Reuse an OPENAI_API_KEY that is already exported; otherwise prompt for it\n",
+    "# so the key is never hardcoded (or saved) in the notebook.\n",
+    "if not os.environ.get(\"OPENAI_API_KEY\"):\n",
+    "    os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")\n",
+    "\n",
+    "# Each sample pairs a question with the generated answer, the retrieved\n",
+    "# contexts, and the reference (ground-truth) answer.\n",
+    "data_samples = {\n",
+    "    'question': [\n",
+    "        'When was the first super bowl?',\n",
+    "        'Who won the most super bowls?',\n",
+    "    ],\n",
+    "    'answer': [\n",
+    "        'The first superbowl was held on Jan 15, 1967',\n",
+    "        'The most super bowls have been won by The New England Patriots',\n",
+    "    ],\n",
+    "    'contexts': [\n",
+    "        ['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],\n",
+    "        ['The Green Bay Packers...Green Bay, Wisconsin.', 'The Packers compete...Football Conference'],\n",
+    "    ],\n",
+    "    'ground_truth': [\n",
+    "        'The first superbowl was held on January 15, 1967',\n",
+    "        'The New England Patriots have won the Super Bowl a record six times',\n",
+    "    ],\n",
+    "}\n",
+    "\n",
+    "dataset = Dataset.from_dict(data_samples)\n",
+    "\n",
+    "# Score the dataset on both metrics and display the result as a DataFrame.\n",
+    "score = evaluate(dataset, metrics=[faithfulness, answer_correctness])\n",
+    "score.to_pandas()"
+   ]
   }
  ],
  "metadata": {