-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevaluate.py
More file actions
125 lines (107 loc) · 4.04 KB
/
Copy pathevaluate.py
File metadata and controls
125 lines (107 loc) · 4.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""
RAGAS Evaluation Script
────────────────────────
Evaluates the RAG pipeline on a small QA dataset and prints metrics.
Usage:
python evaluate.py
Metrics produced:
- faithfulness (are answers grounded in the retrieved context?)
- answer_relevancy (does the answer address the question?)
- context_precision (are retrieved chunks actually relevant?)
- context_recall (did we retrieve all necessary information?)
"""
import math
from typing import Any

from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)

from data_loader import load_documents, split_documents
from vector_store import build_vector_store, get_retriever
from rag import build_rag_chain, ask
# ---------------------------------------------------------------------------
# Evaluation dataset
#
# Every entry pairs a "question" with its reference answer ("ground_truth").
# Replace these Q&A pairs so that they match YOUR article content.
# ---------------------------------------------------------------------------
EVAL_DATA = [
    dict(
        question="What is retrieval-augmented generation?",
        ground_truth=(
            "Retrieval-Augmented Generation (RAG) is a technique that "
            "combines a retrieval system with a language model. "
            "The retrieval system fetches relevant documents from a "
            "knowledge base, and the language model uses that context "
            "to generate grounded, accurate answers."
        ),
    ),
    dict(
        question="What are the main components of a RAG system?",
        ground_truth=(
            "A RAG system consists of three main components: "
            "a document store or knowledge base, "
            "a retriever that finds relevant chunks using vector similarity search, "
            "and a language model that generates answers conditioned on the "
            "retrieved context."
        ),
    ),
    dict(
        question="What is a vector database used for in RAG?",
        ground_truth=(
            "A vector database stores document embeddings and enables "
            "fast similarity search. "
            "When a user asks a question, the query is embedded and compared "
            "against stored vectors to retrieve the most semantically relevant "
            "document chunks."
        ),
    ),
]
def run_evaluation() -> Any:
    """Run the RAG pipeline on ``EVAL_DATA`` and score it with RAGAS.

    The function indexes the documents, answers every evaluation question
    through the RAG chain, passes the collected questions, answers,
    retrieved contexts and reference answers to ``ragas.evaluate``, and
    prints the mean of each metric next to a bar that grows by one block
    per 0.05 of score.

    Returns:
        The result object produced by ``ragas.evaluate`` (the same object
        whose ``to_pandas()`` output is summarised on stdout).
    """
    print("🔄 Indexing documents for evaluation...")
    docs = load_documents()
    chunks = split_documents(docs)
    vector_store = build_vector_store(chunks)
    retriever = get_retriever(vector_store)
    chain = build_rag_chain(retriever)

    print("🔄 Running RAG pipeline on evaluation questions...")
    questions, answers, contexts, ground_truths = [], [], [], []
    for item in EVAL_DATA:
        question = item["question"]
        result = ask(question, retriever, chain)
        # NOTE(review): ask() is also given the retriever, so it presumably
        # retrieves on its own; the contexts recorded below come from this
        # separate call -- confirm they match what the chain actually saw.
        retrieved_docs = retriever.invoke(question)

        questions.append(question)
        answers.append(result.answer)
        contexts.append([doc.page_content for doc in retrieved_docs])
        ground_truths.append(item["ground_truth"])
        print(f" ✓ Q: {question[:60]}...")

    dataset = Dataset.from_dict(
        {
            "question": questions,
            "answer": answers,
            "contexts": contexts,
            "ground_truth": ground_truths,
        }
    )

    print("\n📊 Running RAGAS evaluation...\n")
    results = evaluate(
        dataset=dataset,
        metrics=[
            faithfulness,
            answer_relevancy,
            context_precision,
            context_recall,
        ],
    )

    # Pretty-print results
    print("=" * 50)
    print(" RAGAS EVALUATION RESULTS")
    print("=" * 50)
    scores = results.to_pandas()
    for metric in ("faithfulness", "answer_relevancy",
                   "context_precision", "context_recall"):
        if metric not in scores.columns:
            continue
        mean_score = scores[metric].mean()
        # A column whose rows are all NaN has a NaN mean, and int(NaN)
        # raises ValueError. Print the value without a bar instead of
        # crashing before the results are returned.
        if math.isnan(mean_score):
            bar = ""
        else:
            bar = "█" * int(mean_score * 20)
        print(f" {metric:<22} {mean_score:.3f} {bar}")
    print("=" * 50)
    print()
    return results
# Run the evaluation only when this file is executed as a script
# (``python evaluate.py``), not when it is imported as a module.
if __name__ == "__main__":
    run_evaluation()