
generate_visual_report

Create visual HTML reports from evaluation results to analyze traces, diffs, scores, and timelines in your browser.

Instructions

Generate a beautiful self-contained HTML visual report from the latest evalview check or run results. Opens automatically in the browser. Call this after run_check or run_snapshot to give the user a visual breakdown of traces, diffs, scores, and timelines. Returns the absolute path to the generated HTML file.

Input Schema

Name           Required   Description
results_file   No         Path to a specific results JSON file. If omitted, uses the latest file in .evalview/results/.
title          No         Report title shown in the header (default: 'EvalView Report')
notes          No         Optional note shown in the report header (e.g. 'after refactor PR #42')
no_auto_open   No         Set to true to suppress auto-opening the browser (useful in CI). Default: false.
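
For example, an MCP client might invoke the tool with an arguments payload like the following (illustrative values; every field is optional):

    {
      "results_file": ".evalview/results/latest_run.json",
      "title": "EvalView Report",
      "notes": "after refactor PR #42",
      "no_auto_open": false
    }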

Implementation Reference

  • The `generate_visual_report` function is the main entry point: it orchestrates data collection (KPIs, per-test traces, diffs, timeline) and renders the self-contained HTML file.
    def generate_visual_report(
        results: List["EvaluationResult"],
        diffs: Optional[List["TraceDiff"]] = None,
        output_path: Optional[str] = None,
        auto_open: bool = True,
        title: str = "EvalView Report",
        notes: Optional[str] = None,
        compare_results: Optional[List[List["EvaluationResult"]]] = None,
        compare_labels: Optional[List[str]] = None,
        golden_traces: Optional[Dict[str, Any]] = None,
        judge_usage: Optional[Dict[str, Any]] = None,
        default_tab: Optional[str] = None,
    ) -> str:
        """Generate a self-contained visual HTML report.
    
        Args:
            results: List of EvaluationResult objects.
            diffs: Optional list of TraceDiff objects for diff tab.
            output_path: Where to write the HTML (default: .evalview/reports/<timestamp>.html).
            auto_open: If True, open the report in the default browser.
            title: Report title shown in the header.
            notes: Optional free-text note shown in the header.
            golden_traces: Optional dict mapping test name to GoldenTrace. When provided,
                the Diffs tab renders side-by-side baseline vs. current Mermaid diagrams.
    
        Returns:
            Absolute path to the generated HTML file.
        """
        if output_path is None:
            os.makedirs(".evalview/reports", exist_ok=True)
            ts = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_path = f".evalview/reports/{ts}.html"
    
        kpis = _kpis(results)
        baseline = _baseline_meta(golden_traces)
        traces = []
        for r in results:
            try:
                cost = r.trace.metrics.total_cost or 0.0
                latency = r.trace.metrics.total_latency or 0.0
                tokens = None
                input_tokens = 0
                output_tokens = 0
                if r.trace.metrics.total_tokens:
                    input_tokens = r.trace.metrics.total_tokens.input_tokens
                    output_tokens = r.trace.metrics.total_tokens.output_tokens
                    tokens = input_tokens + output_tokens
            except AttributeError:
                cost, latency, tokens = 0.0, 0.0, None
                input_tokens, output_tokens = 0, 0
            has_steps = bool(getattr(r.trace, "steps", None))
            models = _extract_models(r)
            baseline_created = ""
            baseline_model = "Unknown"
            if golden_traces and r.test_case in golden_traces:
                metadata = getattr(golden_traces[r.test_case], "metadata", None)
                if metadata:
                    blessed_at = getattr(metadata, "blessed_at", None)
                    if isinstance(blessed_at, datetime):
                        baseline_created = blessed_at.strftime("%Y-%m-%d %H:%M")
                    model_id = getattr(metadata, "model_id", None)
                    model_provider = getattr(metadata, "model_provider", None)
                    if model_id:
                        baseline_model = f"{model_provider}/{model_id}" if model_provider else str(model_id)
                    else:
                        trace_model_id = getattr(getattr(golden_traces[r.test_case], "trace", None), "model_id", None)
                        trace_model_provider = getattr(getattr(golden_traces[r.test_case], "trace", None), "model_provider", None)
                        if trace_model_id:
                            baseline_model = f"{trace_model_provider}/{trace_model_id}" if trace_model_provider else str(trace_model_id)
                        else:
                            baseline_model = "Not recorded in snapshot"
    
            # Extract turn and tool info for the trace list view
            turn_list = []
            if getattr(r.trace, "turns", None):
                for turn in getattr(r.trace, "turns", []) or []:
                    turn_entry = {
                        "index": int(getattr(turn, "index", 0) or 0),
                        "query": str(getattr(turn, "query", "") or ""),
                        "output": _strip_markdown(str(getattr(turn, "output", "") or "")),
                        "tools": [str(tool) for tool in (getattr(turn, "tools", None) or [])],
                        "latency_ms": float(getattr(turn, "latency_ms", 0) or 0),
                        "cost": float(getattr(turn, "cost", 0) or 0),
                    }
                    # Attach per-turn evaluation if present
                    eval_obj = getattr(turn, "evaluation", None)
                    if eval_obj is not None:
                        turn_entry["evaluation"] = {
                            "passed": eval_obj.passed,
                            "tool_accuracy": eval_obj.tool_accuracy,
                            "forbidden_violations": eval_obj.forbidden_violations,
                            "contains_passed": eval_obj.contains_passed,
                            "contains_failed": eval_obj.contains_failed,
                            "not_contains_passed": eval_obj.not_contains_passed,
                            "not_contains_failed": eval_obj.not_contains_failed,
                        }
                    turn_list.append(turn_entry)
            elif has_steps:
                current_t_idx = None
                current_turn_data = None
                turn_fallback_latency = 0.0
                turn_fallback_cost = 0.0
                if not any(getattr(step, "turn_index", None) is not None for step in r.trace.steps):
                    turn_fallback_latency = float(getattr(r.trace.metrics, "total_latency", 0) or 0)
                    turn_fallback_cost = float(getattr(r.trace.metrics, "total_cost", 0) or 0)
                for step in r.trace.steps:
                    t_idx = getattr(step, "turn_index", None)
                    if t_idx is not None:
                        if t_idx != current_t_idx:
                            current_t_idx = t_idx
                            current_turn_data = {
                                "index": t_idx,
                                "query": getattr(step, "turn_query", ""),
                                "output": "",
                                "tools": [],
                                "latency_ms": 0.0,
                                "cost": 0.0,
                            }
                            turn_list.append(current_turn_data)
    
                        if current_turn_data is not None:
                            tool_name = str(getattr(step, "tool_name", None) or getattr(step, "step_name", None) or "unknown")
                            current_turn_data["tools"].append(tool_name)
                            step_latency = float(getattr(getattr(step, "metrics", None), "latency", 0) or 0)
                            step_cost = float(getattr(getattr(step, "metrics", None), "cost", 0) or 0)
                            current_turn_data["latency_ms"] += step_latency
                            current_turn_data["cost"] += step_cost
    
                if not turn_list and has_steps:
                    turn_list.append({
                        "index": 1,
                        "query": getattr(r, "input_query", "") or "",
                        "output": _strip_markdown(getattr(r, "actual_output", "") or ""),
                        "tools": [
                            str(getattr(step, "tool_name", None) or getattr(step, "step_name", None) or "unknown")
                            for step in r.trace.steps
                        ],
                        "latency_ms": turn_fallback_latency,
                        "cost": turn_fallback_cost,
                    })
    
            # Build failure reasons list for failed tests
            failure_reasons = []
            if not r.passed:
                if r.min_score and r.score < r.min_score:
                    failure_reasons.append(f"Score {round(r.score, 1)} below minimum {round(r.min_score, 1)}")
                evals = r.evaluations
                if evals.output_quality.score < 50:
                    failure_reasons.append(f"Output quality: {round(evals.output_quality.score, 1)}/100")
                if evals.hallucination and getattr(evals.hallucination, "has_hallucination", False):
                    conf = getattr(evals.hallucination, "confidence", None)
                    conf_str = f" ({round(conf * 100)}% confidence)" if conf else ""
                    failure_reasons.append(f"Hallucination detected{conf_str}")
                if evals.safety and not getattr(evals.safety, "is_safe", True):
                    failure_reasons.append("Safety violation")
                if evals.forbidden_tools and getattr(evals.forbidden_tools, "violations", []):
                    failure_reasons.append(f"Forbidden tools used: {', '.join(evals.forbidden_tools.violations)}")
                if evals.tool_accuracy.accuracy < 0.5:
                    failure_reasons.append(f"Tool accuracy: {round(evals.tool_accuracy.accuracy * 100, 1)}%")
    
            output_rationale = getattr(r.evaluations.output_quality, "rationale", "") or ""
    
            # Score breakdown: show how the final score was calculated
            evals = r.evaluations
            tool_acc = round(evals.tool_accuracy.accuracy * 100, 1) if evals.tool_accuracy else None
            output_qual = round(evals.output_quality.score, 1) if evals.output_quality else None
            seq_obj = getattr(evals, "sequence_correctness", None)
            seq_correct = getattr(seq_obj, "correct", None) if seq_obj else None
            weights = getattr(r, "weights", None) or {}
            w_tool = weights.get("tool_accuracy", 0.3)
            w_output = weights.get("output_quality", 0.5)
            w_seq = weights.get("sequence_correctness", 0.2)
    
            traces.append({
                "name": r.test_case,
                "diagram": _mermaid_trace(r) if has_steps else "",
                "has_steps": has_steps,
                "passed": r.passed,
                "cost": f"${cost:.6f}".rstrip('0').rstrip('.') if cost else "$0",
                "latency": f"{int(latency)}ms",
                "tokens": f"{tokens:,} tokens" if tokens else "",
                "input_tokens": input_tokens,
                "output_tokens": output_tokens,
                "score": round(r.score, 1),
                "tool_accuracy": tool_acc,
                "output_quality": output_qual,
                "sequence_correct": seq_correct,
                "w_tool": round(w_tool * 100),
                "w_output": round(w_output * 100),
                "w_seq": round(w_seq * 100),
                "model": ", ".join(models) if models else "Unknown",
                "baseline_created": baseline_created or "Unknown",
                "baseline_model": baseline_model,
                "query": getattr(r, "input_query", "") or "",
                "output": _strip_markdown(getattr(r, "actual_output", "") or ""),
                "turns": turn_list,
                "hallucination": _extract_check_result(r, "hallucination"),
                "safety": _extract_check_result(r, "safety"),
                "pii": _extract_check_result(r, "pii"),
                "forbidden_tools": _extract_check_result(r, "forbidden_tools"),
                "failure_reasons": failure_reasons,
                "output_rationale": output_rationale,
            })
        actual_results_dict = {r.test_case: r for r in results}
        diff_rows = _diff_rows(diffs or [], golden_traces, actual_results_dict)
        timeline = _timeline_data(results)
    
        # Build comparison data if multiple runs provided
        compare_data = None
        if compare_results:
            labels = compare_labels or []
            all_runs = [results] + list(compare_results)
            all_labels = labels if labels else [f"Run {i+1}" for i in range(len(all_runs))]
            compare_data = {
                "labels": all_labels,
                "runs": [_kpis(r) for r in all_runs],
            }
    
        html = _render_template(
            title=title,
            notes=notes or "",
            generated_at=datetime.now().strftime("%Y-%m-%d %H:%M"),
            kpis=kpis,
            baseline=baseline,
            judge_usage=judge_usage or {},
            traces=traces,
            diff_rows=diff_rows,
            timeline=timeline,
            compare=compare_data,
            default_tab=default_tab or "overview",
        )
    
        abs_path = os.path.abspath(output_path)
        with open(abs_path, "w", encoding="utf-8") as f:
            f.write(html)
    
        if auto_open:
            webbrowser.open(f"file://{abs_path}")
    
        return abs_path
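
    A minimal usage sketch, assuming you already have a list of EvaluationResult objects from an evalview run (the loading of `results` itself is not shown):

        from evalview.visualization.generators import generate_visual_report

        # results: List[EvaluationResult] produced by a prior evaluation run
        report_path = generate_visual_report(
            results,
            title="EvalView Report",
            notes="after refactor PR #42",
            auto_open=False,  # e.g. in CI, write the file without opening a browser
        )
        print(report_path)  # absolute path to the generated HTML file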
  • Helper functions in `evalview/visualization/generators.py` handle data extraction, Mermaid diagram generation, and metric calculation for the visual report.
    from __future__ import annotations
    
    import json
    import os
    import webbrowser
    from collections import Counter
    from datetime import datetime
    from typing import Any, Dict, List, Optional, TYPE_CHECKING
    
    if TYPE_CHECKING:
        from evalview.core.types import EvaluationResult
        from evalview.core.diff import TraceDiff
    
    
    # ── Mermaid helpers ────────────────────────────────────────────────────────────
    
    def _mermaid_from_steps(steps: List[Any], query: str = "", output: str = "") -> str:
        """Core Mermaid sequence diagram builder from a steps list."""
        if not steps:
            return "sequenceDiagram\n    Note over Agent: Direct response — no tools used"
    
        lines = ["sequenceDiagram"]
        lines.append("    participant User")
        lines.append("    participant Agent")
    
        seen_tools: Dict[str, str] = {}
        for step in steps:
            tool: str = str(getattr(step, "tool_name", None) or getattr(step, "step_name", None) or "unknown")
            if tool not in seen_tools:
                alias = f"T{len(seen_tools)}"
                seen_tools[tool] = alias
                short = (tool[:31] + "…") if len(tool) > 32 else tool
                lines.append(f"    participant {alias} as {short}")
    
        short_query = _safe_mermaid((query[:40] + "…") if len(query) > 40 else query) if query else "..."
        lines.append(f"    User->>Agent: {short_query}")
    
        current_turn = None
    
        for step in steps:
            step_turn = getattr(step, "turn_index", None)
    
            # Add a turn separator when the turn index changes
            if step_turn is not None and step_turn != current_turn:
                step_query = getattr(step, "turn_query", "") or ""
                safe_query = _safe_mermaid((step_query[:57] + "...") if len(step_query) > 60 else step_query)
                if safe_query:
                    lines.append(f"    Note over User,Agent: Turn {step_turn} - {safe_query}")
                else:
                    lines.append(f"    Note over User,Agent: Turn {step_turn}")
                current_turn = step_turn
    
            tool = str(getattr(step, "tool_name", None) or getattr(step, "step_name", None) or "unknown")
            alias = seen_tools.get(tool, tool)
            params = getattr(step, "parameters", {}) or {}
            param_str = ", ".join(f"{k}={str(v)[:20]}" for k, v in list(params.items())[:2])
            if len(params) > 2:
                param_str += "…"
            success = getattr(step, "success", True)
            arrow = "->>" if success else "-x"
            lines.append(f"    Agent{arrow}{alias}: {_safe_mermaid(param_str or tool)}")
            out = getattr(step, "output", None)
            out_str = str(out)[:30] if out is not None else "ok"
            lines.append(f"    {alias}-->Agent: {_safe_mermaid(out_str)}")
    
        short_out = _safe_mermaid((output[:40] + "…") if len(output) > 40 else output) if output else "..."
        lines.append(f"    Agent-->>User: {short_out}")
    
        return "\n".join(lines)
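
    # Illustrative example (not from the source): for a single step calling a
    # "search" tool with parameters {"query": "weather in Paris"} and output
    # "sunny and 22C", the builder above produces diagram text roughly like:
    #
    #     sequenceDiagram
    #         participant User
    #         participant Agent
    #         participant T0 as search
    #         User->>Agent: weather in Paris
    #         Agent->>T0: query=weather in Paris
    #         T0-->Agent: sunny and 22C
    #         Agent-->>User: It is sunny in Paris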
    
    
    def _mermaid_trace(result: "EvaluationResult") -> str:
        """Convert an EvaluationResult into a Mermaid sequence diagram."""
        steps = []
        try:
            steps = result.trace.steps or []
        except AttributeError:
            pass
        query: str = str(getattr(result, "input_query", "") or "")
        output: str = str(getattr(result, "actual_output", "") or "")
        return _mermaid_from_steps(steps, query, output)
    
    
    def _strip_markdown(text: str) -> str:
        """Remove common markdown symbols for clean display in HTML."""
        import re
        text = re.sub(r'\*{1,3}(.+?)\*{1,3}', r'\1', text, flags=re.DOTALL)  # bold/italic
        text = re.sub(r'`(.+?)`', r'\1', text, flags=re.DOTALL)               # inline code
        text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)            # headings
        return text
    
    
    def _safe_mermaid(s: str) -> str:
        """Strip everything except safe alphanumeric + basic punctuation for Mermaid labels."""
        import re
        s = s.replace("\n", " ").replace("\r", "")
        s = re.sub(r'[^\w\s\.\-_/=:,]', '', s)
        s = s[:28].strip()
        return (s + '...') if len(s) == 28 else s or '...'
    
    
    # ── KPI helpers ────────────────────────────────────────────────────────────────
    
    def _kpis(results: List["EvaluationResult"]) -> Dict[str, Any]:
        if not results:
            return {}
        total = len(results)
        passed = sum(1 for r in results if r.passed)
        scores = [r.score for r in results]
        costs = []
        latencies = []
        total_input_tokens = 0
        total_output_tokens = 0
        for r in results:
            try:
                costs.append(r.trace.metrics.total_cost or 0)
                latencies.append(r.trace.metrics.total_latency or 0)
                if r.trace.metrics.total_tokens:
                    total_input_tokens += r.trace.metrics.total_tokens.input_tokens
                    total_output_tokens += r.trace.metrics.total_tokens.output_tokens
            except AttributeError:
                pass
        models = _collect_models(results)
        total_tokens = total_input_tokens + total_output_tokens
        return {
            "total": total,
            "passed": passed,
            "failed": total - passed,
            "pass_rate": round(passed / total * 100, 1),
            "avg_score": round(sum(scores) / len(scores), 1),
            "total_cost": round(sum(costs), 6),
            "avg_latency_ms": round(sum(latencies) / len(latencies), 0) if latencies else 0,
            "scores": scores,
            "test_names": [r.test_case for r in results],
            "models": models,
            "models_display": ", ".join(models) if models else "Unknown",
            "total_input_tokens": total_input_tokens,
            "total_output_tokens": total_output_tokens,
            "total_tokens": total_tokens,
        }
    
    
    def _clean_model_name(model_id: str, provider: Optional[str] = None) -> str:
        """Format a model name for display — human-readable, no internal prefixes."""
        # Skip transport-layer "providers" that aren't real LLM providers
        non_providers = {"http", "mcp", "unknown", "none", ""}
        if provider and provider.lower() not in non_providers:
            return f"{provider}/{model_id}"
        return model_id
    
    
    def _extract_models(result: "EvaluationResult") -> List[str]:
        """Extract best-effort model labels from a result (deduplicated by model ID)."""
        seen_ids: set[str] = set()
        labels: list[str] = []
        trace = result.trace
        model_id = getattr(trace, "model_id", None)
        model_provider = getattr(trace, "model_provider", None)
        if model_id:
            seen_ids.add(model_id)
            labels.append(_clean_model_name(model_id, model_provider))
    
        # Only add span models if the trace didn't already report a model_id.
        # When model_id is set (from the agent response), span models are
        # typically just the config echo from the HTTP adapter — showing both
        # creates confusing duplicates like "anthropic/claude-sonnet-4-5, claude-sonnet-4-6".
        trace_context = getattr(trace, "trace_context", None)
        if trace_context and not model_id:
            for span in trace_context.spans:
                if span.llm and span.llm.model and span.llm.model not in seen_ids:
                    seen_ids.add(span.llm.model)
                    provider = span.llm.provider or model_provider
                    labels.append(_clean_model_name(span.llm.model, provider))
    
        return labels
    
    
    def _extract_check_result(result: "EvaluationResult", check_name: str) -> Optional[Dict[str, Any]]:
        """Extract a check result (hallucination, safety, pii, forbidden_tools) for the template."""
        evals = getattr(result, "evaluations", None)
        if not evals:
            return None
        check = getattr(evals, check_name, None)
        if check is None:
            return None
        data: Dict[str, Any] = {"passed": getattr(check, "passed", True)}
        if check_name == "hallucination":
            data["has_hallucination"] = getattr(check, "has_hallucination", False)
            data["confidence"] = getattr(check, "confidence", 0)
            data["details"] = getattr(check, "details", "")
        elif check_name == "safety":
            data["is_safe"] = getattr(check, "is_safe", True)
            data["categories"] = getattr(check, "categories_flagged", [])
            data["severity"] = getattr(check, "severity", "safe")
            data["details"] = getattr(check, "details", "")
        elif check_name == "pii":
            data["has_pii"] = getattr(check, "has_pii", False)
            data["types"] = getattr(check, "types_detected", [])
            data["details"] = getattr(check, "details", "")
        elif check_name == "forbidden_tools":
            data["violations"] = getattr(check, "violations", [])
        return data
    
    
    def _collect_models(results: List["EvaluationResult"]) -> List[str]:
        """Collect model labels across a run, ordered by frequency."""
        counts: Counter[str] = Counter()
        for result in results:
            for label in _extract_models(result):
                counts[label] += 1
        return [label for label, _ in counts.most_common()]
    
    
    def _baseline_meta(golden_traces: Optional[Dict[str, Any]]) -> Dict[str, Any]:
        """Summarize baseline creation metadata."""
        if not golden_traces:
            return {
                "latest_created_display": "Unknown",
                "models_display": "Unknown",
            }
    
        blessed_times: list[datetime] = []
        model_counts: Counter[str] = Counter()
        for golden in golden_traces.values():
            metadata = getattr(golden, "metadata", None)
            if not metadata:
                continue
            blessed_at = getattr(metadata, "blessed_at", None)
            if isinstance(blessed_at, datetime):
                blessed_times.append(blessed_at)
            model_id = getattr(metadata, "model_id", None)
            model_provider = getattr(metadata, "model_provider", None)
            if model_id:
                model_counts[f"{model_provider}/{model_id}" if model_provider else str(model_id)] += 1
    
        latest_created = max(blessed_times).strftime("%Y-%m-%d %H:%M") if blessed_times else "Unknown"
        models = [label for label, _ in model_counts.most_common()]
        return {
            "latest_created_display": latest_created,
            "models_display": ", ".join(models) if models else "Not recorded in snapshot",
        }
    
    
    # ── Diff helpers ───────────────────────────────────────────────────────────────
    
    def _diff_rows(
        diffs: List["TraceDiff"],
        golden_traces: Optional[Dict[str, Any]] = None,
        actual_results: Optional[Dict[str, Any]] = None,
    ) -> List[Dict[str, Any]]:
        rows = []
        for d in diffs:
            status = str(getattr(d, "overall_severity", "passed")).lower().replace("diffstatus.", "")
            output_diff = getattr(d, "output_diff", None)
            similarity = round(getattr(output_diff, "similarity", 1.0) * 100, 1) if output_diff else 100.0
            semantic_similarity = None
            if output_diff and getattr(output_diff, "semantic_similarity", None) is not None:
                semantic_similarity = round(output_diff.semantic_similarity * 100, 1)
            golden_out = getattr(output_diff, "golden_preview", "") if output_diff else ""
            actual_out = getattr(output_diff, "actual_preview", "") if output_diff else ""
            diff_lines = getattr(output_diff, "diff_lines", []) if output_diff else []
            score_delta = getattr(d, "score_diff", 0.0) or 0.0
            test_name = getattr(d, "test_name", "")
    
            # Extract tool sequences from golden trace and tool_diffs
            golden_tools: List[str] = []
            actual_tools: List[str] = []
            if golden_traces and test_name in golden_traces:
                gt = golden_traces[test_name]
                golden_tools = getattr(gt, "tool_sequence", []) or []
            # Reconstruct actual tools from golden + diffs
            tool_diffs = getattr(d, "tool_diffs", []) or []
            if actual_results and test_name in actual_results:
                try:
                    result = actual_results[test_name]
                    actual_tools = [
                        str(getattr(s, "tool_name", None) or getattr(s, "step_name", "?"))
                        for s in (result.trace.steps or [])
                    ]
                except AttributeError:
                    pass
    
            # Extract parameter diffs for the HTML template
            param_diffs = []
            for td in tool_diffs:
                for pd in getattr(td, "parameter_diffs", []):
                    sim = None
                    if pd.similarity is not None:
                        sim = round(pd.similarity * 100, 1)
                    param_diffs.append({
                        "step": td.position + 1,
                        "tool": td.golden_tool or td.actual_tool or "?",
                        "param": pd.param_name,
                        "golden": str(pd.golden_value)[:60] if pd.golden_value is not None else "",
                        "actual": str(pd.actual_value)[:60] if pd.actual_value is not None else "",
                        "type": pd.diff_type,
                        "similarity": sim,
                    })
    
            # Generate side-by-side trajectory diagrams when trace data is available
            golden_diagram = ""
            actual_diagram = ""
            if golden_traces and test_name in golden_traces:
                gt = golden_traces[test_name]
                try:
                    gt_steps = gt.trace.steps or []
                except AttributeError:
                    gt_steps = []
                golden_diagram = _mermaid_from_steps(gt_steps)
            if actual_results and test_name in actual_results:
                actual_diagram = _mermaid_trace(actual_results[test_name])
    
            actual_score = None
            if actual_results and test_name in actual_results:
                actual_score = round(getattr(actual_results[test_name], "score", 0), 1)
            baseline_score = round(actual_score - score_delta, 1) if actual_score is not None else None
    
            rows.append({
                "name": test_name,
                "status": status,
                "score_delta": round(score_delta, 1),
                "actual_score": actual_score,
                "baseline_score": baseline_score,
                "similarity": similarity,
                "semantic_similarity": semantic_similarity,
                "golden_tools": golden_tools,
                "actual_tools": actual_tools,
                "golden_out": golden_out[:600],
                "actual_out": actual_out[:600],
                "diff_lines": diff_lines[:50],
                "param_diffs": param_diffs,
                "golden_diagram": golden_diagram,
                "actual_diagram": actual_diagram,
            })
        return rows
    
    
    # ── Timeline helpers ───────────────────────────────────────────────────────────
    
    def _timeline_data(results: List["EvaluationResult"]) -> List[Dict[str, Any]]:
        rows = []
        for r in results:
            try:
                steps = r.trace.steps or []
                fallback_latency = 0.0
                fallback_cost = 0.0
                if steps:
                    total_latency = float(getattr(r.trace.metrics, "total_latency", 0) or 0)
                    total_cost = float(getattr(r.trace.metrics, "total_cost", 0) or 0)
                    if not any((getattr(getattr(step, "metrics", None), "latency", 0) or 0) > 0 for step in steps):
                        fallback_latency = total_latency / len(steps) if total_latency > 0 else 0.0
                    if not any((getattr(getattr(step, "metrics", None), "cost", 0) or 0) > 0 for step in steps):
                        fallback_cost = total_cost / len(steps) if total_cost > 0 else 0.0
                for step in steps:
                    lat = getattr(step.metrics, "latency", 0) if hasattr(step, "metrics") else 0
                    cost = getattr(step.metrics, "cost", 0) if hasattr(step, "metrics") else 0
                    if (not lat or lat <= 0) and fallback_latency:
                        lat = fallback_latency
                    if (not cost or cost <= 0) and fallback_cost:
                        cost = fallback_cost
                    tool = getattr(step, "tool_name", "unknown")[:20]
                    test = r.test_case[:15]
                    rows.append({
                        "test": test,
                        "tool": tool,
                        "label": f"{test} \u203a {tool}",
                        "latency": round(lat, 1),
                        "cost": round(cost, 6),
                        "success": getattr(step, "success", True),
                    })
            except AttributeError:
                pass
        return rows
    
    
    # ── Main entry point ───────────────────────────────────────────────────────────
  • The `generate_visual_report` tool is registered and called in `evalview/mcp_server.py`.
    elif name == "generate_visual_report":
        return self._generate_visual_report(args)
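
    The handler body is not reproduced on this page. A plausible sketch of how `_generate_visual_report` could resolve an omitted `results_file` argument, based on the schema description above (the helper below is hypothetical):

        import glob
        import os

        def _latest_results_file() -> str:
            """Hypothetical helper: pick the newest JSON file in .evalview/results/."""
            candidates = sorted(glob.glob(".evalview/results/*.json"), key=os.path.getmtime)
            if not candidates:
                raise FileNotFoundError("No results found in .evalview/results/ — run a check first")
            return candidates[-1]

        # The handler would then load that file into EvaluationResult objects and call
        # generate_visual_report(..., auto_open=not args.get("no_auto_open", False)).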
