generate_visual_report
Create visual HTML reports from evaluation results to analyze traces, diffs, scores, and timelines in your browser.
Instructions
Generate a beautiful self-contained HTML visual report from the latest evalview check or run results. Opens automatically in the browser. Call this after run_check or run_snapshot to give the user a visual breakdown of traces, diffs, scores, and timelines. Returns the absolute path to the generated HTML file.
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| results_file | No | Path to a specific results JSON file. If omitted, uses the latest file in .evalview/results/. | |
| title | No | Report title shown in the header (default: 'EvalView Report') | |
| notes | No | Optional note shown in the report header (e.g. 'after refactor PR #42') | |
| no_auto_open | No | Set to true to suppress auto-opening the browser (useful in CI). Default: false. | |
Implementation Reference
- The `generate_visual_report` function acts as the handler for generating the visual report, orchestrating data collection (KPIs, trace diffs, timeline) and rendering the HTML file.
def generate_visual_report(
    results: List["EvaluationResult"],
    diffs: Optional[List["TraceDiff"]] = None,
    output_path: Optional[str] = None,
    auto_open: bool = True,
    title: str = "EvalView Report",
    notes: Optional[str] = None,
    compare_results: Optional[List[List["EvaluationResult"]]] = None,
    compare_labels: Optional[List[str]] = None,
    golden_traces: Optional[Dict[str, Any]] = None,
    judge_usage: Optional[Dict[str, Any]] = None,
    default_tab: Optional[str] = None,
) -> str:
    """Generate a self-contained visual HTML report.

    Args:
        results: List of EvaluationResult objects.
        diffs: Optional list of TraceDiff objects for diff tab.
        output_path: Where to write the HTML
            (default: .evalview/reports/<timestamp>.html). Missing parent
            directories are created.
        auto_open: If True, open the report in the default browser.
        title: Report title shown in the header.
        notes: Optional free-text note shown in the header.
        compare_results: Optional extra runs (each a list of
            EvaluationResult). When provided, KPIs for ``results`` followed
            by each extra run are handed to the template as comparison data.
        compare_labels: Optional labels for the compared runs, in the order
            ``[results, *compare_results]``. Runs without a label are named
            ``"Run N"``.
        golden_traces: Optional dict mapping test name to GoldenTrace. When
            provided, the Diffs tab renders side-by-side baseline vs. current
            Mermaid diagrams.
        judge_usage: Optional dict passed through to the template
            (an empty dict is used when omitted).
        default_tab: Tab identifier passed to the template; defaults to
            ``"overview"``.

    Returns:
        Absolute path to the generated HTML file.
    """
    # Function-scope import so this block does not depend on module imports
    # beyond the ones it already relied on.
    from pathlib import Path

    if output_path is None:
        os.makedirs(".evalview/reports", exist_ok=True)
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_path = f".evalview/reports/{ts}.html"

    kpis = _kpis(results)
    baseline = _baseline_meta(golden_traces)

    traces = []
    for r in results:
        # ── Cost / latency / token metrics (best effort) ──────────────────
        try:
            cost = r.trace.metrics.total_cost or 0.0
            latency = r.trace.metrics.total_latency or 0.0
            tokens = None
            input_tokens = 0
            output_tokens = 0
            if r.trace.metrics.total_tokens:
                input_tokens = r.trace.metrics.total_tokens.input_tokens
                output_tokens = r.trace.metrics.total_tokens.output_tokens
                tokens = input_tokens + output_tokens
        except AttributeError:
            # Any missing metric attribute resets the whole group to zeros.
            cost, latency, tokens = 0.0, 0.0, None
            input_tokens, output_tokens = 0, 0

        has_steps = bool(getattr(r.trace, "steps", None))
        models = _extract_models(r)

        # ── Baseline (golden trace) metadata for this test ────────────────
        baseline_created = ""
        baseline_model = "Unknown"
        if golden_traces and r.test_case in golden_traces:
            golden = golden_traces[r.test_case]
            metadata = getattr(golden, "metadata", None)
            if metadata:
                blessed_at = getattr(metadata, "blessed_at", None)
                if isinstance(blessed_at, datetime):
                    baseline_created = blessed_at.strftime("%Y-%m-%d %H:%M")
                model_id = getattr(metadata, "model_id", None)
                model_provider = getattr(metadata, "model_provider", None)
                if model_id:
                    baseline_model = (
                        f"{model_provider}/{model_id}" if model_provider else str(model_id)
                    )
                else:
                    # NOTE(review): the source formatting was lost; this
                    # fallback is assumed to pair with `if model_id` — confirm.
                    golden_trace = getattr(golden, "trace", None)
                    trace_model_id = getattr(golden_trace, "model_id", None)
                    trace_model_provider = getattr(golden_trace, "model_provider", None)
                    if trace_model_id:
                        baseline_model = (
                            f"{trace_model_provider}/{trace_model_id}"
                            if trace_model_provider
                            else str(trace_model_id)
                        )
                    else:
                        baseline_model = "Not recorded in snapshot"

        # ── Extract turn and tool info for the trace list view ────────────
        turn_list = []
        turn_fallback_latency = 0.0
        turn_fallback_cost = 0.0
        trace_turns = getattr(r.trace, "turns", None)
        if trace_turns:
            for turn in trace_turns:
                turn_entry = {
                    "index": int(getattr(turn, "index", 0) or 0),
                    "query": str(getattr(turn, "query", "") or ""),
                    "output": _strip_markdown(str(getattr(turn, "output", "") or "")),
                    "tools": [str(tool) for tool in (getattr(turn, "tools", None) or [])],
                    "latency_ms": float(getattr(turn, "latency_ms", 0) or 0),
                    "cost": float(getattr(turn, "cost", 0) or 0),
                }
                # Attach per-turn evaluation if present
                eval_obj = getattr(turn, "evaluation", None)
                if eval_obj is not None:
                    turn_entry["evaluation"] = {
                        "passed": eval_obj.passed,
                        "tool_accuracy": eval_obj.tool_accuracy,
                        "forbidden_violations": eval_obj.forbidden_violations,
                        "contains_passed": eval_obj.contains_passed,
                        "contains_failed": eval_obj.contains_failed,
                        "not_contains_passed": eval_obj.not_contains_passed,
                        "not_contains_failed": eval_obj.not_contains_failed,
                    }
                turn_list.append(turn_entry)
        elif has_steps:
            current_t_idx = None
            current_turn_data = None
            # When no step carries a turn index, the single synthetic turn
            # below reports the trace-level totals instead.
            if not any(getattr(step, "turn_index", None) is not None for step in r.trace.steps):
                turn_fallback_latency = float(getattr(r.trace.metrics, "total_latency", 0) or 0)
                turn_fallback_cost = float(getattr(r.trace.metrics, "total_cost", 0) or 0)

            for step in r.trace.steps:
                t_idx = getattr(step, "turn_index", None)
                if t_idx is not None:
                    if t_idx != current_t_idx:
                        current_t_idx = t_idx
                        current_turn_data = {
                            "index": t_idx,
                            "query": getattr(step, "turn_query", ""),
                            "output": "",
                            "tools": [],
                            "latency_ms": 0.0,
                            "cost": 0.0,
                        }
                        turn_list.append(current_turn_data)
                    # NOTE(review): nesting inferred from collapsed source —
                    # steps without a turn_index are not attributed to a turn.
                    if current_turn_data is not None:
                        tool_name = str(
                            getattr(step, "tool_name", None)
                            or getattr(step, "step_name", None)
                            or "unknown"
                        )
                        current_turn_data["tools"].append(tool_name)
                        step_metrics = getattr(step, "metrics", None)
                        step_latency = float(getattr(step_metrics, "latency", 0) or 0)
                        step_cost = float(getattr(step_metrics, "cost", 0) or 0)
                        current_turn_data["latency_ms"] += step_latency
                        current_turn_data["cost"] += step_cost

            if not turn_list:
                # No turn structure at all: present the whole trace as turn 1.
                turn_list.append({
                    "index": 1,
                    "query": getattr(r, "input_query", "") or "",
                    "output": _strip_markdown(getattr(r, "actual_output", "") or ""),
                    "tools": [
                        str(
                            getattr(step, "tool_name", None)
                            or getattr(step, "step_name", None)
                            or "unknown"
                        )
                        for step in r.trace.steps
                    ],
                    "latency_ms": turn_fallback_latency,
                    "cost": turn_fallback_cost,
                })

        # ── Build failure reasons list for failed tests ───────────────────
        failure_reasons = []
        if not r.passed:
            if r.min_score and r.score < r.min_score:
                failure_reasons.append(
                    f"Score {round(r.score, 1)} below minimum {round(r.min_score, 1)}"
                )
            evals = r.evaluations
            # output_quality / tool_accuracy are treated as optional by the
            # score breakdown below, so guard them here as well.
            if evals.output_quality is not None and evals.output_quality.score < 50:
                failure_reasons.append(
                    f"Output quality: {round(evals.output_quality.score, 1)}/100"
                )
            if evals.hallucination and getattr(evals.hallucination, "has_hallucination", False):
                conf = getattr(evals.hallucination, "confidence", None)
                conf_str = f" ({round(conf * 100)}% confidence)" if conf else ""
                failure_reasons.append(f"Hallucination detected{conf_str}")
            if evals.safety and not getattr(evals.safety, "is_safe", True):
                failure_reasons.append("Safety violation")
            if evals.forbidden_tools and getattr(evals.forbidden_tools, "violations", []):
                violations = ", ".join(str(v) for v in evals.forbidden_tools.violations)
                failure_reasons.append(f"Forbidden tools used: {violations}")
            if evals.tool_accuracy is not None and evals.tool_accuracy.accuracy < 0.5:
                failure_reasons.append(
                    f"Tool accuracy: {round(evals.tool_accuracy.accuracy * 100, 1)}%"
                )

        output_rationale = getattr(r.evaluations.output_quality, "rationale", "") or ""

        # ── Score breakdown: show how the final score was calculated ──────
        evals = r.evaluations
        tool_acc = round(evals.tool_accuracy.accuracy * 100, 1) if evals.tool_accuracy else None
        output_qual = round(evals.output_quality.score, 1) if evals.output_quality else None
        seq_obj = getattr(evals, "sequence_correctness", None)
        seq_correct = getattr(seq_obj, "correct", None) if seq_obj else None
        weights = getattr(r, "weights", None) or {}
        w_tool = weights.get("tool_accuracy", 0.3)
        w_output = weights.get("output_quality", 0.5)
        w_seq = weights.get("sequence_correctness", 0.2)

        traces.append({
            "name": r.test_case,
            "diagram": _mermaid_trace(r) if has_steps else "",
            "has_steps": has_steps,
            "passed": r.passed,
            "cost": f"${cost:.6f}".rstrip('0').rstrip('.') if cost else "$0",
            "latency": f"{int(latency)}ms",
            "tokens": f"{tokens:,} tokens" if tokens else "",
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "score": round(r.score, 1),
            "tool_accuracy": tool_acc,
            "output_quality": output_qual,
            "sequence_correct": seq_correct,
            "w_tool": round(w_tool * 100),
            "w_output": round(w_output * 100),
            "w_seq": round(w_seq * 100),
            "model": ", ".join(models) if models else "Unknown",
            "baseline_created": baseline_created or "Unknown",
            "baseline_model": baseline_model,
            "query": getattr(r, "input_query", "") or "",
            "output": _strip_markdown(getattr(r, "actual_output", "") or ""),
            "turns": turn_list,
            "hallucination": _extract_check_result(r, "hallucination"),
            "safety": _extract_check_result(r, "safety"),
            "pii": _extract_check_result(r, "pii"),
            "forbidden_tools": _extract_check_result(r, "forbidden_tools"),
            "failure_reasons": failure_reasons,
            "output_rationale": output_rationale,
        })

    actual_results_dict = {r.test_case: r for r in results}
    diff_rows = _diff_rows(diffs or [], golden_traces, actual_results_dict)
    timeline = _timeline_data(results)

    # Build comparison data if multiple runs provided
    compare_data = None
    if compare_results:
        all_runs = [results] + list(compare_results)
        # Keep caller-supplied labels and name any remaining runs "Run N" so
        # every run has a label even when too few labels were given.
        all_labels = list(compare_labels or [])
        all_labels += [f"Run {i+1}" for i in range(len(all_labels), len(all_runs))]
        compare_data = {
            "labels": all_labels,
            "runs": [_kpis(run) for run in all_runs],
        }

    html = _render_template(
        title=title,
        notes=notes or "",
        generated_at=datetime.now().strftime("%Y-%m-%d %H:%M"),
        kpis=kpis,
        baseline=baseline,
        judge_usage=judge_usage or {},
        traces=traces,
        diff_rows=diff_rows,
        timeline=timeline,
        compare=compare_data,
        default_tab=default_tab or "overview",
    )

    abs_path = os.path.abspath(output_path)
    # A caller-supplied output_path may point into a directory that does not
    # exist yet; create it so the write below does not fail.
    parent_dir = os.path.dirname(abs_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(abs_path, "w", encoding="utf-8") as f:
        f.write(html)

    if auto_open:
        # as_uri() yields a well-formed, percent-encoded file URL on every
        # platform (e.g. file:///C:/... on Windows).
        webbrowser.open(Path(abs_path).as_uri())
    return abs_path

# - Helper functions within `evalview/visualization/generators.py` handle data extraction, mermaid diagram generation, and metric calculation for the visual report.
from __future__ import annotations

import json
import os
import re
import webbrowser
from collections import Counter
from datetime import datetime
from typing import Any, Dict, List, Optional, TYPE_CHECKING

if TYPE_CHECKING:
    from evalview.core.types import EvaluationResult
    from evalview.core.diff import TraceDiff


# Maximum number of characters kept in a sanitized Mermaid label.
_MERMAID_LABEL_MAX = 28
# Anything outside word chars, whitespace and a few punctuation marks is dropped.
_MERMAID_UNSAFE_RE = re.compile(r'[^\w\s\.\-_/=:,]')
_MD_EMPHASIS_RE = re.compile(r'\*{1,3}(.+?)\*{1,3}', flags=re.DOTALL)
_MD_INLINE_CODE_RE = re.compile(r'`(.+?)`', flags=re.DOTALL)
_MD_HEADING_RE = re.compile(r'^#{1,6}\s+', flags=re.MULTILINE)


def _step_tool_name(step: Any, default: str = "unknown") -> str:
    """Return the step's ``tool_name``, else its ``step_name``, else *default*."""
    return str(
        getattr(step, "tool_name", None) or getattr(step, "step_name", None) or default
    )


def _step_metric(step: Any, name: str) -> float:
    """Return ``step.metrics.<name>``, or 0 when the metric is missing or None."""
    return getattr(getattr(step, "metrics", None), name, 0) or 0


# ── Mermaid helpers ────────────────────────────────────────────────────────────

def _mermaid_from_steps(steps: List[Any], query: str = "", output: str = "") -> str:
    """Core Mermaid sequence diagram builder from a steps list.

    Args:
        steps: Step objects; ``tool_name``/``step_name``, ``turn_index``,
            ``turn_query``, ``parameters``, ``success`` and ``output`` are
            read via ``getattr`` so missing attributes are tolerated.
        query: User query shown on the first User->>Agent arrow.
        output: Final answer shown on the closing Agent-->>User arrow.

    Returns:
        Mermaid ``sequenceDiagram`` source as a newline-joined string.
    """
    if not steps:
        return "sequenceDiagram\n Note over Agent: Direct response — no tools used"

    lines = ["sequenceDiagram", " participant User", " participant Agent"]

    # One participant per distinct tool, aliased T0, T1, ... in first-seen order.
    seen_tools: Dict[str, str] = {}
    for step in steps:
        tool = _step_tool_name(step)
        if tool not in seen_tools:
            alias = f"T{len(seen_tools)}"
            seen_tools[tool] = alias
            short = (tool[:31] + "…") if len(tool) > 32 else tool
            # A line break inside a participant label would split the statement.
            short = short.replace("\n", " ").replace("\r", "")
            lines.append(f" participant {alias} as {short}")

    short_query = (
        _safe_mermaid((query[:40] + "…") if len(query) > 40 else query) if query else "..."
    )
    lines.append(f" User->>Agent: {short_query}")

    current_turn = None
    for step in steps:
        step_turn = getattr(step, "turn_index", None)
        # Add a turn separator when the turn index changes
        if step_turn is not None and step_turn != current_turn:
            step_query = getattr(step, "turn_query", "") or ""
            # _safe_mermaid("") returns "...", so only sanitize a real query;
            # otherwise the plain "Turn N" note below could never be emitted.
            safe_query = ""
            if step_query:
                safe_query = _safe_mermaid(
                    (step_query[:57] + "...") if len(step_query) > 60 else step_query
                )
            if safe_query:
                lines.append(f" Note over User,Agent: Turn {step_turn} - {safe_query}")
            else:
                lines.append(f" Note over User,Agent: Turn {step_turn}")
            current_turn = step_turn

        tool = _step_tool_name(step)
        alias = seen_tools.get(tool, tool)
        params = getattr(step, "parameters", {}) or {}
        # Show at most two parameters, each value clipped to 20 characters.
        param_str = ", ".join(f"{k}={str(v)[:20]}" for k, v in list(params.items())[:2])
        if len(params) > 2:
            param_str += "…"
        success = getattr(step, "success", True)
        arrow = "->>" if success else "-x"
        lines.append(f" Agent{arrow}{alias}: {_safe_mermaid(param_str or tool)}")
        out = getattr(step, "output", None)
        out_str = str(out)[:30] if out is not None else "ok"
        lines.append(f" {alias}-->Agent: {_safe_mermaid(out_str)}")

    short_out = (
        _safe_mermaid((output[:40] + "…") if len(output) > 40 else output) if output else "..."
    )
    lines.append(f" Agent-->>User: {short_out}")
    return "\n".join(lines)


def _mermaid_trace(result: "EvaluationResult") -> str:
    """Convert an EvaluationResult into a Mermaid sequence diagram."""
    try:
        steps = result.trace.steps or []
    except AttributeError:
        # No trace / steps attribute: render the "direct response" diagram.
        steps = []
    query: str = str(getattr(result, "input_query", "") or "")
    output: str = str(getattr(result, "actual_output", "") or "")
    return _mermaid_from_steps(steps, query, output)


def _strip_markdown(text: str) -> str:
    """Remove common markdown symbols for clean display in HTML.

    Strips ``*``/``**``/``***`` emphasis markers, inline-code backticks and
    leading ``#`` heading markers. ``None`` yields an empty string and other
    non-string values are converted with ``str()`` first.
    """
    if text is None:
        return ""
    if not isinstance(text, str):
        text = str(text)
    text = _MD_EMPHASIS_RE.sub(r'\1', text)  # bold/italic
    text = _MD_INLINE_CODE_RE.sub(r'\1', text)  # inline code
    text = _MD_HEADING_RE.sub('', text)  # headings
    return text


def _safe_mermaid(s: str) -> str:
    """Strip everything except safe alphanumeric + basic punctuation for Mermaid labels.

    The cleaned text is limited to 28 characters; ``...`` is appended only
    when characters were actually cut off. An empty result becomes ``...``.
    """
    s = str(s).replace("\n", " ").replace("\r", "")
    s = _MERMAID_UNSAFE_RE.sub('', s)
    # Decide on the ellipsis before stripping, so a label that is exactly at
    # the limit is not marked as truncated and a truncated one always is.
    truncated = len(s) > _MERMAID_LABEL_MAX
    s = s[:_MERMAID_LABEL_MAX].strip()
    if not s:
        return '...'
    return (s + '...') if truncated else s


# ── KPI helpers ────────────────────────────────────────────────────────────────

def _kpis(results: List["EvaluationResult"]) -> Dict[str, Any]:
    """Aggregate pass/fail counts, scores, cost, latency, tokens and models.

    Returns an empty dict when *results* is empty.
    """
    if not results:
        return {}

    total = len(results)
    passed = sum(1 for r in results if r.passed)
    scores = [r.score for r in results]
    costs = []
    latencies = []
    total_input_tokens = 0
    total_output_tokens = 0
    for r in results:
        try:
            costs.append(r.trace.metrics.total_cost or 0)
            latencies.append(r.trace.metrics.total_latency or 0)
            if r.trace.metrics.total_tokens:
                total_input_tokens += r.trace.metrics.total_tokens.input_tokens
                total_output_tokens += r.trace.metrics.total_tokens.output_tokens
        except AttributeError:
            # Best effort: results without these metric attributes are skipped.
            pass

    models = _collect_models(results)
    total_tokens = total_input_tokens + total_output_tokens
    return {
        "total": total,
        "passed": passed,
        "failed": total - passed,
        "pass_rate": round(passed / total * 100, 1),
        "avg_score": round(sum(scores) / len(scores), 1),
        "total_cost": round(sum(costs), 6),
        "avg_latency_ms": round(sum(latencies) / len(latencies), 0) if latencies else 0,
        "scores": scores,
        "test_names": [r.test_case for r in results],
        "models": models,
        "models_display": ", ".join(models) if models else "Unknown",
        "total_input_tokens": total_input_tokens,
        "total_output_tokens": total_output_tokens,
        "total_tokens": total_tokens,
    }


def _clean_model_name(model_id: str, provider: Optional[str] = None) -> str:
    """Format a model name for display — human-readable, no internal prefixes."""
    # Skip transport-layer "providers" that aren't real LLM providers
    non_providers = {"http", "mcp", "unknown", "none", ""}
    if provider and provider.lower() not in non_providers:
        return f"{provider}/{model_id}"
    return model_id


def _extract_models(result: "EvaluationResult") -> List[str]:
    """Extract best-effort model labels from a result (deduplicated by model ID)."""
    seen_ids: set[str] = set()
    labels: list[str] = []

    trace = getattr(result, "trace", None)
    model_id = getattr(trace, "model_id", None)
    model_provider = getattr(trace, "model_provider", None)
    if model_id:
        seen_ids.add(model_id)
        labels.append(_clean_model_name(model_id, model_provider))

    # Only add span models if the trace didn't already report a model_id.
    # When model_id is set (from the agent response), span models are
    # typically just the config echo from the HTTP adapter — showing both
    # creates confusing duplicates like "anthropic/claude-sonnet-4-5, claude-sonnet-4-6".
    trace_context = getattr(trace, "trace_context", None)
    if trace_context and not model_id:
        for span in getattr(trace_context, "spans", None) or []:
            llm = getattr(span, "llm", None)
            span_model = getattr(llm, "model", None)
            if span_model and span_model not in seen_ids:
                seen_ids.add(span_model)
                provider = getattr(llm, "provider", None) or model_provider
                labels.append(_clean_model_name(span_model, provider))

    return labels


def _extract_check_result(result: "EvaluationResult", check_name: str) -> Optional[Dict[str, Any]]:
    """Extract a check result (hallucination, safety, pii, forbidden_tools) for the template.

    Returns None when the result has no evaluations or no such check.
    """
    evals = getattr(result, "evaluations", None)
    if not evals:
        return None
    check = getattr(evals, check_name, None)
    if check is None:
        return None

    data: Dict[str, Any] = {"passed": getattr(check, "passed", True)}
    if check_name == "hallucination":
        data["has_hallucination"] = getattr(check, "has_hallucination", False)
        data["confidence"] = getattr(check, "confidence", 0)
        data["details"] = getattr(check, "details", "")
    elif check_name == "safety":
        data["is_safe"] = getattr(check, "is_safe", True)
        data["categories"] = getattr(check, "categories_flagged", [])
        data["severity"] = getattr(check, "severity", "safe")
        data["details"] = getattr(check, "details", "")
    elif check_name == "pii":
        data["has_pii"] = getattr(check, "has_pii", False)
        data["types"] = getattr(check, "types_detected", [])
        data["details"] = getattr(check, "details", "")
    elif check_name == "forbidden_tools":
        data["violations"] = getattr(check, "violations", [])
    return data


def _collect_models(results: List["EvaluationResult"]) -> List[str]:
    """Collect model labels across a run, ordered by frequency."""
    counts: Counter[str] = Counter()
    for result in results:
        for label in _extract_models(result):
            counts[label] += 1
    return [label for label, _ in counts.most_common()]


def _baseline_meta(golden_traces: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    """Summarize baseline creation metadata.

    Returns the most recent ``blessed_at`` timestamp and the models recorded
    in the golden traces' metadata, both as display strings.
    """
    if not golden_traces:
        return {
            "latest_created_display": "Unknown",
            "models_display": "Unknown",
        }

    blessed_times: list[datetime] = []
    model_counts: Counter[str] = Counter()
    for golden in golden_traces.values():
        metadata = getattr(golden, "metadata", None)
        if not metadata:
            continue
        blessed_at = getattr(metadata, "blessed_at", None)
        if isinstance(blessed_at, datetime):
            blessed_times.append(blessed_at)
        model_id = getattr(metadata, "model_id", None)
        model_provider = getattr(metadata, "model_provider", None)
        if model_id:
            label = f"{model_provider}/{model_id}" if model_provider else str(model_id)
            model_counts[label] += 1

    latest_created = max(blessed_times).strftime("%Y-%m-%d %H:%M") if blessed_times else "Unknown"
    models = [label for label, _ in model_counts.most_common()]
    return {
        "latest_created_display": latest_created,
        "models_display": ", ".join(models) if models else "Not recorded in snapshot",
    }


# ── Diff helpers ───────────────────────────────────────────────────────────────

def _diff_rows(
    diffs: List["TraceDiff"],
    golden_traces: Optional[Dict[str, Any]] = None,
    actual_results: Optional[Dict[str, Any]] = None,
) -> List[Dict[str, Any]]:
    """Build one template row per TraceDiff.

    Args:
        diffs: TraceDiff objects to render.
        golden_traces: Optional mapping of test name to golden trace, used
            for the baseline tool sequence and baseline diagram.
        actual_results: Optional mapping of test name to the current result,
            used for the actual tool list, diagram and score.

    Returns:
        List of dicts with status, scores, similarities, tool lists, output
        previews (600 chars max), diff lines (50 max), parameter diffs and
        Mermaid diagrams.
    """
    rows = []
    for d in diffs:
        status = str(getattr(d, "overall_severity", "passed")).lower().replace("diffstatus.", "")
        output_diff = getattr(d, "output_diff", None)

        # Missing or None values fall back to neutral defaults instead of
        # raising during arithmetic or slicing.
        raw_similarity = getattr(output_diff, "similarity", 1.0) if output_diff else 1.0
        similarity = round((1.0 if raw_similarity is None else raw_similarity) * 100, 1)
        semantic_similarity = None
        if output_diff and getattr(output_diff, "semantic_similarity", None) is not None:
            semantic_similarity = round(output_diff.semantic_similarity * 100, 1)
        golden_out = (getattr(output_diff, "golden_preview", "") or "") if output_diff else ""
        actual_out = (getattr(output_diff, "actual_preview", "") or "") if output_diff else ""
        diff_lines = (getattr(output_diff, "diff_lines", []) or []) if output_diff else []
        score_delta = getattr(d, "score_diff", 0.0) or 0.0
        test_name = getattr(d, "test_name", "")

        has_golden = bool(golden_traces) and test_name in golden_traces
        has_actual = bool(actual_results) and test_name in actual_results

        # Extract tool sequences from golden trace and the actual result
        golden_tools: List[str] = []
        actual_tools: List[str] = []
        if has_golden:
            golden_tools = getattr(golden_traces[test_name], "tool_sequence", []) or []
        tool_diffs = getattr(d, "tool_diffs", []) or []
        if has_actual:
            try:
                result = actual_results[test_name]
                # "?" marks steps with neither a tool name nor a step name.
                actual_tools = [_step_tool_name(s, "?") for s in (result.trace.steps or [])]
            except AttributeError:
                pass

        # Extract parameter diffs for the HTML template
        param_diffs = []
        for td in tool_diffs:
            for p_diff in getattr(td, "parameter_diffs", []) or []:
                sim = None
                if p_diff.similarity is not None:
                    sim = round(p_diff.similarity * 100, 1)
                param_diffs.append({
                    "step": td.position + 1,
                    "tool": td.golden_tool or td.actual_tool or "?",
                    "param": p_diff.param_name,
                    "golden": str(p_diff.golden_value)[:60] if p_diff.golden_value is not None else "",
                    "actual": str(p_diff.actual_value)[:60] if p_diff.actual_value is not None else "",
                    "type": p_diff.diff_type,
                    "similarity": sim,
                })

        # Generate side-by-side trajectory diagrams when trace data is available
        golden_diagram = ""
        actual_diagram = ""
        if has_golden:
            try:
                gt_steps = golden_traces[test_name].trace.steps or []
            except AttributeError:
                gt_steps = []
            golden_diagram = _mermaid_from_steps(gt_steps)
        if has_actual:
            actual_diagram = _mermaid_trace(actual_results[test_name])

        actual_score = None
        if has_actual:
            actual_score = round(getattr(actual_results[test_name], "score", 0), 1)
        # The baseline score is derived: current score minus the recorded delta.
        baseline_score = round(actual_score - score_delta, 1) if actual_score is not None else None

        rows.append({
            "name": test_name,
            "status": status,
            "score_delta": round(score_delta, 1),
            "actual_score": actual_score,
            "baseline_score": baseline_score,
            "similarity": similarity,
            "semantic_similarity": semantic_similarity,
            "golden_tools": golden_tools,
            "actual_tools": actual_tools,
            "golden_out": golden_out[:600],
            "actual_out": actual_out[:600],
            "diff_lines": diff_lines[:50],
            "param_diffs": param_diffs,
            "golden_diagram": golden_diagram,
            "actual_diagram": actual_diagram,
        })
    return rows


# ── Timeline helpers ───────────────────────────────────────────────────────────

def _timeline_data(results: List["EvaluationResult"]) -> List[Dict[str, Any]]:
    """Flatten every step of every result into timeline rows.

    When no step of a trace reports a positive latency (or cost), the
    trace-level total is split evenly across its steps as a fallback.
    """
    rows = []
    for r in results:
        try:
            steps = r.trace.steps or []
            fallback_latency = 0.0
            fallback_cost = 0.0
            if steps:
                total_latency = float(getattr(r.trace.metrics, "total_latency", 0) or 0)
                total_cost = float(getattr(r.trace.metrics, "total_cost", 0) or 0)
                if not any(_step_metric(step, "latency") > 0 for step in steps):
                    fallback_latency = total_latency / len(steps) if total_latency > 0 else 0.0
                if not any(_step_metric(step, "cost") > 0 for step in steps):
                    fallback_cost = total_cost / len(steps) if total_cost > 0 else 0.0

            for step in steps:
                # _step_metric maps missing/None metrics to 0 so round() is safe.
                lat = _step_metric(step, "latency")
                cost = _step_metric(step, "cost")
                if lat <= 0 and fallback_latency:
                    lat = fallback_latency
                if cost <= 0 and fallback_cost:
                    cost = fallback_cost
                # A None tool_name falls back to step_name / "unknown" rather
                # than failing on the slice.
                tool = _step_tool_name(step)[:20]
                test = r.test_case[:15]
                rows.append({
                    "test": test,
                    "tool": tool,
                    "label": f"{test} \u203a {tool}",
                    "latency": round(lat, 1),
                    "cost": round(cost, 6),
                    "success": getattr(step, "success", True),
                })
        except AttributeError:
            # Best effort: a result without the expected trace attributes
            # contributes no (further) rows.
            pass
    return rows


# ── Main entry point ───────────────────────────────────────────────────────────

# - evalview/mcp_server.py:462-463 (registration) The `generate_visual_report` tool is registered and called in `evalview/mcp_server.py`.
elif name == "generate_visual_report": return self._generate_visual_report(args)