import logging
from typing import Any, Dict, List, Optional

try:
    from ragas import evaluate as ragas_evaluate
    from ragas.metrics import (
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    )
    from datasets import Dataset

    RAGAS_AVAILABLE = True
except ImportError:
    RAGAS_AVAILABLE = False
    # Placeholders to prevent NameError in __init__ before the availability check
    ragas_evaluate = None
    context_precision = None
    context_recall = None
    faithfulness = None
    answer_relevancy = None
    Dataset = None

from src.libs.evaluator.base_evaluator import BaseEvaluator

logger = logging.getLogger(__name__)

class RagasEvaluator(BaseEvaluator):
"""
Evaluator implementation using Ragas library.
Supports metrics: context_precision, context_recall, faithfulness, answer_relevancy.
"""
    def __init__(self, metrics: Optional[List[str]] = None):
        super().__init__()
        if not RAGAS_AVAILABLE:
            raise ImportError(
                "ragas package is not installed. Please install it with `pip install ragas`."
            )
        self.metrics_map = {
            "context_precision": context_precision,
            "context_recall": context_recall,
            "faithfulness": faithfulness,
            "answer_relevancy": answer_relevancy,
        }
        self.selected_metrics = []
        if metrics:
            for m in metrics:
                if m in self.metrics_map:
                    self.selected_metrics.append(self.metrics_map[m])
                else:
                    logger.warning(f"Metric '{m}' is not supported by RagasEvaluator. Skipping.")
        if not self.selected_metrics:
            # Default to all supported metrics if none were specified (or none were valid)
            self.selected_metrics = list(self.metrics_map.values())

    def evaluate(
        self,
        query: str,
        retrieved_ids: List[str],
        golden_ids: List[str],
        trace: Optional[Any] = None,
        **kwargs: Any,
    ) -> Dict[str, float]:
        """
        Run Ragas evaluation.

        Expects the following kwargs:
            - retrieved_texts (List[str]): Content of the retrieved chunks.
            - generated_answer (str): Answer generated by the LLM (required for faithfulness and answer_relevancy).
            - golden_answer (str): Ground-truth answer (required for context_recall and context_precision).

        Note: retrieved_ids and golden_ids are ignored by Ragas metrics
        but are kept for interface compatibility.
        """
        retrieved_texts = kwargs.get("retrieved_texts", [])
        generated_answer = kwargs.get("generated_answer", "")
        golden_answer = kwargs.get("golden_answer", "")
        # Prepare the dataset
        data = {
            "question": [query],
            "contexts": [retrieved_texts],
        }
        if generated_answer:
            data["answer"] = [generated_answer]
        if golden_answer:
            data["ground_truth"] = [golden_answer]

        # Filter metrics based on the available data
        active_metrics = []
        for metric in self.selected_metrics:
            # Check the requirements of each metric.
            # This is a simplification; Ragas raises errors if required columns are missing.
            if metric.name in ["faithfulness", "answer_relevancy"] and not generated_answer:
                logger.warning(f"Skipping {metric.name} due to missing generated_answer.")
                continue
            if metric.name in ["context_recall"] and not golden_answer:
                logger.warning(f"Skipping {metric.name} due to missing golden_answer.")
                continue
            if metric.name in ["context_precision"] and not golden_answer:
                # context_precision typically needs ground_truth (or at least relevant chunks)
                logger.warning(f"Skipping {metric.name} due to missing golden_answer.")
                continue
            active_metrics.append(metric)

        if not active_metrics:
            logger.warning("No applicable metrics for the provided data.")
            return {}
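
        # Note (assumption based on typical Ragas usage): ragas_evaluate() consumes a
        # datasets.Dataset with the columns built above (question / contexts / answer /
        # ground_truth) and, in the versions this module targets, returns a dict-like
        # result keyed by metric name, e.g. {"faithfulness": 0.9}.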
        try:
            dataset = Dataset.from_dict(data)
            results = ragas_evaluate(dataset=dataset, metrics=active_metrics)
            return dict(results)
        except Exception as e:
            logger.error(f"Ragas evaluation failed: {e}")
            return {}
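

# Illustrative usage sketch (not part of the module's public API): running this
# requires ragas and datasets to be installed and an LLM backend configured for
# Ragas (e.g. an OpenAI API key), since the Ragas metrics call an LLM under the hood.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    evaluator = RagasEvaluator(metrics=["faithfulness", "answer_relevancy"])
    scores = evaluator.evaluate(
        query="What is the capital of France?",
        retrieved_ids=["doc-1"],
        golden_ids=["doc-1"],
        retrieved_texts=["Paris is the capital of France."],
        generated_answer="Paris is the capital of France.",
        golden_answer="Paris.",
    )
    print(scores)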