Skip to main content
Glama
dashboard.js (33.5 kB)
// ---------------------------------------------------------------------------
// Tool-evaluation dashboard.
//
// Loads a results JSON file (and optionally a benchmark JSON file), computes
// success/timing metrics and renders them, together with a per-tool drill-down,
// into the page. Both files are expected to look like
//   { config: {...}, results: [ { tool, toolName, selfHealingEnabled, ... } ] }
// ---------------------------------------------------------------------------

// Dashboard state shared by every render pass.
let currentData = null;
let benchmarkData = null;

// Source of unique DOM ids for the per-attempt JSON <pre> blocks.
let attemptCardCounter = 0;

// Characters that must be replaced before text is interpolated into HTML
// (element content or quoted attribute values).
const HTML_ESCAPES = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&quot;',
  "'": '&#39;',
};

/**
 * Ensures a parsed JSON document has the `results` array every renderer relies on.
 * @param {*} data - Parsed JSON value.
 * @returns {object} The same value, once validated.
 * @throws {Error} When `data` is not an object with a `results` array.
 */
function validateResultsFile(data) {
  if (data === null || typeof data !== 'object' || !Array.isArray(data.results)) {
    throw new Error('expected an object with a "results" array');
  }
  return data;
}

/** @returns {boolean} True when the attempt passed overall validation. */
function didPass(attempt) {
  return attempt.overallValidationPassed === true;
}

/**
 * Arithmetic mean of a list of numbers.
 * @param {number[]} values
 * @returns {number|null} The mean, or null for an empty list.
 */
function average(values) {
  if (values.length === 0) return null;
  return values.reduce((sum, value) => sum + value, 0) / values.length;
}

/**
 * Relative change of `current` against `benchmark`, in percent.
 * @returns {number|null} Null when the benchmark is 0 (the ratio is undefined).
 */
function computePercentChange(current, benchmark) {
  if (benchmark === 0) return null;
  return ((current - benchmark) / benchmark) * 100;
}

/**
 * Formats a millisecond duration as seconds with two decimals.
 * @param {number|null|undefined} milliseconds
 * @returns {string} e.g. "1.50s", or "N/A" when the value is missing.
 */
function formatSeconds(milliseconds) {
  return milliseconds != null ? `${(milliseconds / 1000).toFixed(2)}s` : 'N/A';
}

// File input handlers

/** Wires the two file inputs to the loaders below. Requires the DOM. */
function registerFileInputs() {
  document.getElementById('jsonFile').addEventListener('change', (event) => {
    const file = event.target.files[0];
    if (!file) return;

    const reader = new FileReader();
    reader.onload = (e) => {
      try {
        const data = validateResultsFile(JSON.parse(e.target.result));
        currentData = data;
        renderDashboard(data);
        hideError();
      } catch (error) {
        showError(`Failed to parse JSON file: ${error.message}`);
      }
    };
    reader.readAsText(file);
  });

  document.getElementById('benchmarkFile').addEventListener('change', (event) => {
    const file = event.target.files[0];
    if (!file) return;

    const reader = new FileReader();
    reader.onload = (e) => {
      try {
        // Validate before assigning so a bad file never replaces a good benchmark.
        benchmarkData = validateResultsFile(JSON.parse(e.target.result));
        console.log('Benchmark loaded:', benchmarkData.results.length, 'attempts');
        if (currentData) {
          renderDashboard(currentData);
        }
        hideError();
      } catch (error) {
        showError(`Failed to parse benchmark JSON: ${error.message}`);
      }
    };
    reader.readAsText(file);
  });
}

/** Shows `message` as plain text in the error banner. */
function showError(message) {
  const errorEl = document.getElementById('errorMessage');
  errorEl.textContent = message;
  errorEl.style.display = 'block';
}

function hideError() {
  document.getElementById('errorMessage').style.display = 'none';
}

/**
 * Shows `message` in the warning banner.
 * The message is inserted as HTML, so callers must escape any untrusted text.
 */
function showWarning(message) {
  const warningEl = document.getElementById('warningMessage');
  warningEl.innerHTML = message;
  warningEl.style.display = 'block';
}

function hideWarning() {
  document.getElementById('warningMessage').style.display = 'none';
}

/** Renders every dashboard section for `data` and reveals the sections. */
function renderDashboard(data) {
  checkForNewTools(data);
  renderBenchmarkSummary(data);
  renderMetrics(data);
  renderTools(data);
  document.getElementById('metricsSection').style.display = 'block';
  document.getElementById('toolsSection').style.display = 'block';
}

/** Warns about tools present in `data` but absent from the loaded benchmark. */
function checkForNewTools(data) {
  if (!benchmarkData) {
    hideWarning();
    return;
  }

  const benchmarkToolIds = new Set(benchmarkData.results.map((r) => r.tool));
  const currentToolIds = [...new Set(data.results.map((r) => r.tool))];
  const newTools = currentToolIds.filter((id) => !benchmarkToolIds.has(id));

  if (newTools.length > 0) {
    // Tool ids come from the loaded file, so escape them before using innerHTML.
    const toolList = newTools.map((id) => `<code>${escapeHtml(id)}</code>`).join(', ');
    showWarning(`⚠️ <strong>New tools not in benchmark:</strong> ${toolList}`);
  } else {
    hideWarning();
  }
}

/** Renders the one-line description of the benchmark being compared against. */
function renderBenchmarkSummary(data) {
  const summaryEl = document.getElementById('benchmarkSummary');
  if (!benchmarkData) {
    summaryEl.style.display = 'none';
    return;
  }

  const currentToolIds = [...new Set(data.results.map((r) => r.tool))];
  const benchmarkToolIds = new Set(benchmarkData.results.map((r) => r.tool));
  const commonToolIds = currentToolIds.filter((id) => benchmarkToolIds.has(id));

  const totalBenchmarkTools = benchmarkToolIds.size;
  const totalBenchmarkAttempts = benchmarkData.results.length;
  const benchmarkConfig = benchmarkData.config ?? {};
  const benchmarkLlm = escapeHtml(`${benchmarkConfig.llmProvider} / ${benchmarkConfig.backendModel}`);
  const benchmarkAttemptsPerMode = escapeHtml(benchmarkConfig.attemptsPerMode);

  summaryEl.innerHTML = `
    <strong>Benchmark Comparison:</strong>
    Comparing your metrics against the same <strong>${commonToolIds.length}</strong> tools
    (There are ${totalBenchmarkTools} possible total in benchmark) |
    <strong>${totalBenchmarkAttempts}</strong> total attempts |
    <strong>${benchmarkAttemptsPerMode}</strong> attempts per mode |
    LLM: <strong>${benchmarkLlm}</strong>
  `;
  summaryEl.style.display = 'block';
}

// Calculate metrics from results

/**
 * Aggregates success rates and timings over a flat list of attempts.
 *
 * Note the two different scales in the result: `oneShotRate` and
 * `selfHealingRate` are percentages (0-100) of tools with at least one
 * success, while the `*AverageSuccessRate` fields are fractions (0-1).
 * A tool counts towards `selfHealingSuccessfulTools` when it succeeded in
 * either mode. Timings are averaged in the unit stored in the results;
 * missing (null/undefined) timings are ignored.
 *
 * @param {object[]} results - Attempt records.
 * @returns {object} Metrics; rate/average fields are null when not computable.
 */
function calculateMetrics(results) {
  const oneShotAttempts = results.filter((r) => !r.selfHealingEnabled);
  const selfHealingAttempts = results.filter((r) => r.selfHealingEnabled);

  const toolsById = groupByTool(results);
  const toolAttemptLists = Object.values(toolsById);
  const totalTools = toolAttemptLists.length;

  const oneShotSuccessfulTools = toolAttemptLists.filter((attempts) =>
    attempts.some((a) => !a.selfHealingEnabled && didPass(a)),
  ).length;
  const oneShotRate = totalTools > 0 ? (oneShotSuccessfulTools / totalTools) * 100 : null;

  const oneShotAverageSuccessRate = calculateAverageOneShotSuccess(toolsById);
  const selfHealingAverageSuccessRate = calculateAverageSelfHealingSuccess(toolsById);

  // Any passing attempt, regardless of mode, counts here.
  const selfHealingSuccessfulTools = toolAttemptLists.filter((attempts) =>
    attempts.some(didPass),
  ).length;
  const selfHealingRate = totalTools > 0 ? (selfHealingSuccessfulTools / totalTools) * 100 : null;

  const oneShotSuccessfulAttempts = oneShotAttempts.filter(didPass).length;
  const oneShotTotalAttempts = oneShotAttempts.length;
  const selfHealingSuccessfulAttempts = selfHealingAttempts.filter(didPass).length;
  const selfHealingTotalAttempts = selfHealingAttempts.length;

  // `!= null` drops both null and undefined so a missing field cannot yield NaN.
  const avgBuild = average(results.filter((r) => r.buildTime != null).map((r) => r.buildTime));
  const avgOneShotExec = average(
    oneShotAttempts.filter((r) => r.executionTime != null).map((r) => r.executionTime),
  );
  const avgSelfHealingExec = average(
    selfHealingAttempts.filter((r) => r.executionTime != null).map((r) => r.executionTime),
  );

  return {
    oneShotRate,
    oneShotAverageSuccessRate,
    oneShotSuccessfulTools,
    oneShotSuccessfulAttempts,
    oneShotTotalAttempts,
    selfHealingRate,
    selfHealingAverageSuccessRate,
    selfHealingSuccessfulTools,
    selfHealingSuccessfulAttempts,
    selfHealingTotalAttempts,
    totalTools,
    avgBuild,
    avgOneShotExec,
    avgSelfHealingExec,
  };
}

/**
 * Mean of the per-tool success fractions for one mode.
 * Tools with no attempts in that mode are left out of the mean.
 * @param {Object<string, object[]>} toolsById - Attempts grouped by tool id.
 * @param {boolean} selfHealing - Which mode to average.
 * @returns {number|null} Fraction in [0, 1], or null when no tool has attempts.
 */
function calculateAverageSuccessForMode(toolsById, selfHealing) {
  const successRates = [];
  for (const attempts of Object.values(toolsById)) {
    const modeAttempts = attempts.filter((a) => Boolean(a.selfHealingEnabled) === selfHealing);
    if (modeAttempts.length > 0) {
      successRates.push(modeAttempts.filter(didPass).length / modeAttempts.length);
    }
  }
  return average(successRates);
}

/** Mean per-tool success fraction over one-shot attempts (see helper above). */
function calculateAverageOneShotSuccess(toolsById) {
  return calculateAverageSuccessForMode(toolsById, false);
}

/** Mean per-tool success fraction over self-healing attempts (see helper above). */
function calculateAverageSelfHealingSuccess(toolsById) {
  return calculateAverageSuccessForMode(toolsById, true);
}

// Calculate and render metrics

/** Fills the config labels and the five headline metric cells. */
function renderMetrics(data) {
  const { config, results } = data;

  // Config info
  document.getElementById('llm').textContent = `${config.llmProvider} / ${config.backendModel}`;
  document.getElementById('validationLlm').textContent =
    `${config.validationLlmProvider} / ${config.validationLlmModel}`;
  document.getElementById('attemptsPerMode').textContent = config.attemptsPerMode;

  // Calculate current metrics
  const metrics = calculateMetrics(results);

  // Calculate benchmark metrics if available, restricted to the tools in this run
  let benchmarkMetrics = null;
  if (benchmarkData) {
    const currentToolIds = new Set(results.map((r) => r.tool));
    const filteredBenchmarkResults = benchmarkData.results.filter((r) => currentToolIds.has(r.tool));
    benchmarkMetrics = calculateMetrics(filteredBenchmarkResults);
  }

  // Display with deltas - showing average as primary, "at least one" as secondary
  displaySuccessMetricWithAverage(
    'oneShotSuccessRate',
    metrics.oneShotAverageSuccessRate,
    metrics.oneShotRate,
    benchmarkMetrics?.oneShotAverageSuccessRate,
    `${metrics.oneShotSuccessfulTools}/${metrics.totalTools}`,
    `${metrics.oneShotSuccessfulAttempts}/${metrics.oneShotTotalAttempts}`,
    true,
  );
  displaySuccessMetricWithAverage(
    'selfHealingSuccessRate',
    metrics.selfHealingAverageSuccessRate,
    metrics.selfHealingRate,
    benchmarkMetrics?.selfHealingAverageSuccessRate,
    `${metrics.selfHealingSuccessfulTools}/${metrics.totalTools}`,
    `${metrics.selfHealingSuccessfulAttempts}/${metrics.selfHealingTotalAttempts}`,
    true,
  );
  displayMetricWithDelta('avgBuildTime', metrics.avgBuild, benchmarkMetrics?.avgBuild, 's', null, false, true);
  displayMetricWithDelta('avgExecOneShot', metrics.avgOneShotExec, benchmarkMetrics?.avgOneShotExec, 's', null, false, true);
  displayMetricWithDelta('avgExecSelfHealing', metrics.avgSelfHealingExec, benchmarkMetrics?.avgSelfHealingExec, 's', null, false, true);
}

/**
 * Renders a success metric: average rate as the headline, "at least one" rate
 * underneath, and a delta line when a benchmark value is available.
 *
 * @param {string} elementId - Target element id.
 * @param {number|null} averageRate - Fraction in [0, 1].
 * @param {number|null} atLeastOneRate - Percentage in [0, 100].
 * @param {number|null|undefined} benchmark - Benchmark average as a fraction.
 * @param {string} suffix - Text shown in parentheses after the "at least one" rate.
 * @param {string} attemptsCount - Text shown in parentheses after the average.
 * @param {boolean} higherIsBetter - Direction used to colour the delta.
 */
function displaySuccessMetricWithAverage(elementId, averageRate, atLeastOneRate, benchmark, suffix, attemptsCount, higherIsBetter) {
  const element = document.getElementById(elementId);

  if (averageRate === null && atLeastOneRate === null) {
    element.innerHTML = 'N/A';
    return;
  }

  // The unit is attached only to real numbers so a missing value reads "N/A", not "N/A%".
  const avgDisplay = averageRate !== null ? `${(averageRate * 100).toFixed(1)}%` : 'N/A';
  const atLeastOneDisplay = atLeastOneRate !== null ? `${atLeastOneRate.toFixed(1)}%` : 'N/A';
  const suffixText = suffix ? ` (${suffix})` : '';
  const attemptsText = attemptsCount
    ? ` <span style="font-weight: normal;">(${attemptsCount})</span>`
    : '';

  let html = `
    <div style="font-weight: bold;">${avgDisplay}${attemptsText}</div>
    <div style="font-size: 0.65em; color: #333; margin-top: 2px; font-weight: normal;">At least one: ${atLeastOneDisplay}${suffixText}</div>
  `;

  if (benchmark !== null && benchmark !== undefined && averageRate !== null) {
    const benchmarkPercent = benchmark * 100;
    const averagePercent = averageRate * 100;
    const absoluteDiff = averagePercent - benchmarkPercent;
    const percentChange = computePercentChange(averagePercent, benchmarkPercent);
    const diffSign = absoluteDiff > 0 ? '+' : '';
    const deltaClass = getDeltaClass(absoluteDiff, higherIsBetter, false);
    const benchmarkDisplay = benchmarkPercent.toFixed(1);
    // A relative change against a 0% benchmark is undefined, so it is left out.
    const changeText = percentChange === null ? '' : ` (${diffSign}${percentChange.toFixed(1)}% change)`;
    const deltaText = `${diffSign}${absoluteDiff.toFixed(1)} points vs ${benchmarkDisplay}%${changeText}`;
    html += `<span class="metric-delta ${deltaClass}" style="margin-top: 4px; display: block;">${deltaText}</span>`;
  }

  element.innerHTML = html;
}

/**
 * Renders a single value with an optional delta against a benchmark.
 *
 * @param {string} elementId - Target element id.
 * @param {number|null} current - Current value; divided by 1000 when `unit` is 's'.
 * @param {number|null|undefined} benchmark - Benchmark value in the same scale.
 * @param {string} unit - Display unit; 's' and '%' get special formatting.
 * @param {string|null} suffix - Optional text shown in parentheses.
 * @param {boolean} higherIsBetter
 * @param {boolean} [lowerIsBetter=false]
 */
function displayMetricWithDelta(elementId, current, benchmark, unit, suffix, higherIsBetter, lowerIsBetter = false) {
  const element = document.getElementById(elementId);

  if (current === null) {
    element.innerHTML = 'N/A';
    return;
  }

  const divisor = unit === 's' ? 1000 : 1;
  const decimals = unit === '%' ? 1 : 2;
  const displayValue = (current / divisor).toFixed(decimals);
  const suffixText = suffix ? ` (${suffix})` : '';

  if (benchmark === null || benchmark === undefined) {
    element.innerHTML = `${displayValue}${unit}${suffixText}`;
    return;
  }

  const benchmarkDisplay = (benchmark / divisor).toFixed(decimals);
  const absoluteDiff = (current - benchmark) / divisor;
  const percentChange = computePercentChange(current, benchmark);
  const diffSign = absoluteDiff > 0 ? '+' : '';
  const deltaClass = getDeltaClass(absoluteDiff, higherIsBetter, lowerIsBetter);

  let deltaText;
  if (unit === '%') {
    const changeText = percentChange === null ? '' : ` (${diffSign}${percentChange.toFixed(1)}% change)`;
    deltaText = `${diffSign}${absoluteDiff.toFixed(1)} points vs ${benchmarkDisplay}${unit}${changeText}`;
  } else {
    const changeText = percentChange === null ? '' : ` (${diffSign}${percentChange.toFixed(1)}%)`;
    deltaText = `${diffSign}${absoluteDiff.toFixed(2)}${unit} vs ${benchmarkDisplay}${unit}${changeText}`;
  }

  element.innerHTML = `
    ${displayValue}${unit}${suffixText}
    <span class="metric-delta ${deltaClass}">${deltaText}</span>
  `;
}

/**
 * Picks the CSS class for a delta. Differences smaller than 0.1 are neutral;
 * `higherIsBetter` takes precedence over `lowerIsBetter`.
 */
function getDeltaClass(delta, higherIsBetter, lowerIsBetter) {
  if (Math.abs(delta) < 0.1) return 'delta-neutral';
  if (higherIsBetter) {
    return delta > 0 ? 'delta-positive' : 'delta-negative';
  }
  if (lowerIsBetter) {
    return delta < 0 ? 'delta-positive' : 'delta-negative';
  }
  return 'delta-neutral';
}

// Group and render tools

/** Rebuilds the per-tool list from scratch. */
function renderTools(data) {
  const { results } = data;
  const toolsById = groupByTool(results);
  const toolsList = document.getElementById('toolsList');
  toolsList.innerHTML = '';

  for (const [toolId, attempts] of Object.entries(toolsById)) {
    toolsList.appendChild(createToolItem(toolId, attempts));
  }
}

/**
 * Groups attempts by their `tool` id, preserving first-seen order.
 * The returned object has no prototype, so ids such as "constructor" or
 * "__proto__" are stored as ordinary keys instead of colliding with
 * inherited members.
 * @param {object[]} results
 * @returns {Object<string, object[]>}
 */
function groupByTool(results) {
  const grouped = Object.create(null);
  for (const result of results) {
    const toolId = result.tool;
    if (!grouped[toolId]) {
      grouped[toolId] = [];
    }
    grouped[toolId].push(result);
  }
  return grouped;
}

/**
 * Compares one mode's attempts with the benchmark attempts for the same mode.
 * Success *rates* are compared (by cross-multiplication, avoiding float
 * rounding) so runs with different attempt counts are judged fairly.
 * @returns {{type: string, mode: string, detail: string}|null} Null when either
 *   side has no attempts or the rates are equal.
 */
function compareModeToBenchmark(mode, currentAttempts, benchmarkModeAttempts) {
  if (currentAttempts.length === 0 || benchmarkModeAttempts.length === 0) return null;

  const currentSuccessCount = currentAttempts.filter(didPass).length;
  const currentTotal = currentAttempts.length;
  const benchmarkSuccessCount = benchmarkModeAttempts.filter(didPass).length;
  const benchmarkTotal = benchmarkModeAttempts.length;

  // current/currentTotal vs benchmark/benchmarkTotal, without dividing.
  const currentScore = currentSuccessCount * benchmarkTotal;
  const benchmarkScore = benchmarkSuccessCount * currentTotal;
  if (currentScore === benchmarkScore) return null;

  return {
    type: currentScore < benchmarkScore ? 'regression' : 'improvement',
    mode,
    detail: `was ${benchmarkSuccessCount}/${benchmarkTotal}, now ${currentSuccessCount}/${currentTotal}`,
  };
}

/**
 * Determines whether a tool regressed or improved against the benchmark.
 * @returns {{type: string, changes: object[]}|null} Null when no benchmark is
 *   loaded, the tool is not in it, or nothing changed. Any regression makes
 *   the overall type "regression".
 */
function getToolStatusChange(toolId, oneShotAttempts, selfHealingAttempts) {
  if (!benchmarkData) return null;

  const benchmarkAttempts = benchmarkData.results.filter((r) => r.tool === toolId);
  if (benchmarkAttempts.length === 0) return null;

  const changes = [
    compareModeToBenchmark(
      'one-shot',
      oneShotAttempts,
      benchmarkAttempts.filter((a) => !a.selfHealingEnabled),
    ),
    compareModeToBenchmark(
      'self-healing',
      selfHealingAttempts,
      benchmarkAttempts.filter((a) => a.selfHealingEnabled),
    ),
  ].filter((change) => change !== null);

  if (changes.length === 0) return null;

  const hasRegression = changes.some((c) => c.type === 'regression');
  return {
    type: hasRegression ? 'regression' : 'improvement',
    changes,
  };
}

/**
 * Builds the collapsible DOM node for one tool: header with status badges and
 * a body with one tab per mode.
 * @param {string} toolId
 * @param {object[]} attempts - Non-empty list of attempts for this tool.
 * @returns {HTMLDivElement}
 */
function createToolItem(toolId, attempts) {
  const container = document.createElement('div');
  container.className = 'tool-item';

  const oneShotAttempts = attempts.filter((a) => !a.selfHealingEnabled);
  const selfHealingAttempts = attempts.filter((a) => a.selfHealingEnabled);
  const toolName = attempts[0].toolName;

  // Check for status changes vs benchmark
  const statusChangeInfo = getToolStatusChange(toolId, oneShotAttempts, selfHealingAttempts);

  // Create header
  const header = document.createElement('div');
  header.className = 'tool-header';
  const statusChangeBadge = statusChangeInfo ? createStatusChangeBadge(statusChangeInfo) : '';

  // Name and id come from the loaded file: escape before using innerHTML.
  header.innerHTML = `
    <div class="tool-info">
      <div class="tool-name-row">
        <div class="tool-name">${escapeHtml(toolName)}</div>
        ${statusChangeBadge}
      </div>
      <div class="tool-id">${escapeHtml(toolId)}</div>
    </div>
    <div class="status-indicators">
      ${oneShotAttempts.length > 0 ? createModeStatusHTML('One-Shot', oneShotAttempts) : ''}
      ${createModeStatusHTML('Self-Healing', selfHealingAttempts)}
      <span class="expand-icon">▶</span>
    </div>
  `;

  // Create content (hidden by default)
  const content = document.createElement('div');
  content.className = 'tool-content';

  // Always show tabs for consistency
  const tabs = document.createElement('div');
  tabs.className = 'mode-tabs';
  tabs.innerHTML = `
    <div class="tab active" data-mode="oneshot">One-Shot (${oneShotAttempts.length})</div>
    <div class="tab" data-mode="selfhealing">Self-Healing (${selfHealingAttempts.length})</div>
  `;
  content.appendChild(tabs);

  const oneShotContainer = document.createElement('div');
  oneShotContainer.className = 'attempts-container';
  oneShotContainer.dataset.mode = 'oneshot';
  if (oneShotAttempts.length > 0) {
    oneShotAttempts.forEach((attempt, index) => {
      oneShotContainer.appendChild(createAttemptCard(attempt, 'One-Shot', index + 1));
    });
  } else {
    oneShotContainer.innerHTML = '<div class="no-attempts-message">No one-shot attempts</div>';
  }
  content.appendChild(oneShotContainer);

  const selfHealingContainer = document.createElement('div');
  selfHealingContainer.className = 'attempts-container';
  selfHealingContainer.dataset.mode = 'selfhealing';
  selfHealingContainer.style.display = 'none';
  if (selfHealingAttempts.length > 0) {
    selfHealingAttempts.forEach((attempt, index) => {
      selfHealingContainer.appendChild(createAttemptCard(attempt, 'Self-Healing', index + 1));
    });
  } else {
    selfHealingContainer.innerHTML = '<div class="no-attempts-message">No self-healing attempts</div>';
  }
  content.appendChild(selfHealingContainer);

  // Tab switching
  const tabElements = tabs.querySelectorAll('.tab');
  tabElements.forEach((tab) => {
    tab.addEventListener('click', () => {
      tabElements.forEach((t) => t.classList.remove('active'));
      tab.classList.add('active');

      const mode = tab.dataset.mode;
      content.querySelectorAll('.attempts-container').forEach((c) => {
        c.style.display = c.dataset.mode === mode ? 'block' : 'none';
      });
    });
  });

  // Toggle expand/collapse
  header.addEventListener('click', () => {
    header.classList.toggle('expanded');
    content.classList.toggle('expanded');
  });

  container.appendChild(header);
  container.appendChild(content);
  return container;
}

/**
 * Builds the regression/improvement badge HTML for a tool header.
 * @param {{type: string, changes: {mode: string, detail: string}[]}} statusChangeInfo
 * @returns {string} HTML, or '' for an unknown type.
 */
function createStatusChangeBadge(statusChangeInfo) {
  const { type, changes } = statusChangeInfo;

  // Create detailed text showing all changes; escaped once, used in both the
  // tooltip attribute and the visible label.
  const details = escapeHtml(
    changes
      .map((change) => {
        const modeLabel = change.mode === 'one-shot' ? 'One-Shot' : 'Self-Healing';
        return `${modeLabel}: ${change.detail}`;
      })
      .join('; '),
  );

  if (type === 'regression') {
    return `<span class="status-change-badge status-change-regression" title="${details}">⚠ Regression (${details})</span>`;
  }
  if (type === 'improvement') {
    return `<span class="status-change-badge status-change-improvement" title="${details}">✓ Improvement (${details})</span>`;
  }
  return '';
}

/**
 * Builds the "<mode>: successes/total" badge HTML for a tool header.
 * An empty attempt list renders a neutral "Skipped" badge; one or more
 * successes renders green, otherwise red.
 */
function createModeStatusHTML(modeName, attempts) {
  // If no attempts, show skipped
  if (attempts.length === 0) {
    return `
      <div class="mode-status">
        <div class="mode-label">${modeName}</div>
        <div class="status-badges">
          <span class="badge badge-neutral">Skipped</span>
        </div>
      </div>
    `;
  }

  // Count successful attempts
  const successCount = attempts.filter(didPass).length;
  const totalCount = attempts.length;
  const badgeClass = successCount > 0 ? 'badge-success' : 'badge-failure';

  return `
    <div class="mode-status">
      <div class="mode-label">${modeName}</div>
      <div class="status-badges">
        <span class="badge ${badgeClass}">${successCount}/${totalCount}</span>
      </div>
    </div>
  `;
}

/**
 * Returns the attempt that progressed furthest (build -> execution -> validation).
 * Ties keep the earliest attempt; unknown statuses rank lowest.
 * @param {object[]} attempts
 * @returns {object|null} Null for an empty list.
 */
function getFurthestAttempt(attempts) {
  if (attempts.length === 0) return null;

  // Order by progression: build -> execution -> validation
  const statusOrder = {
    build_failed: 1,
    execution_failed: 2,
    validation_failed_llm_failed: 3,
    validation_failed_llm_partial: 3,
    validation_skipped_llm_failed: 3,
    validation_skipped_llm_partial: 3,
    validation_passed: 4,
    validation_failed_llm_passed: 4,
    validation_skipped_llm_passed: 4,
  };

  return attempts.reduce((furthest, current) => {
    const currentOrder = statusOrder[current.status] || 0;
    const furthestOrder = statusOrder[furthest.status] || 0;
    return currentOrder > furthestOrder ? current : furthest;
  });
}

/** Describes the first stage at which an attempt failed. */
function getFailureStage(attempt) {
  if (!attempt.buildSuccess) {
    return { label: '✗ Build Failed', badgeClass: 'badge-failure' };
  }
  if (!attempt.executionSuccess) {
    return { label: '✗ Execution Failed', badgeClass: 'badge-failure' };
  }
  // `status` may be absent from a record; treat that as a plain failure.
  if (attempt.status?.includes('partial')) {
    return { label: '~ Validation Partial', badgeClass: 'badge-partial' };
  }
  return { label: '✗ Validation Failed', badgeClass: 'badge-failure' };
}

/** Badge HTML for a tri-state flag: missing -> neutral, true -> ✓, false -> ✗. */
function createStatusBadge(label, success) {
  if (success == null) {
    return `<span class="badge badge-neutral">${label}: -</span>`;
  }
  const badgeClass = success ? 'badge-success' : 'badge-failure';
  const icon = success ? '✓' : '✗';
  return `<span class="badge ${badgeClass}">${label}: ${icon}</span>`;
}

/** Badge HTML for an LLM judgment ('passes', 'partial', anything else = failed). */
function createLLMBadge(judgment) {
  if (!judgment) {
    return `<span class="badge badge-neutral">LLM: -</span>`;
  }
  if (judgment === 'passes') {
    return `<span class="badge badge-success">LLM: ✓</span>`;
  }
  if (judgment === 'partial') {
    return `<span class="badge badge-partial">LLM: ~</span>`;
  }
  return `<span class="badge badge-failure">LLM: ✗</span>`;
}

/**
 * Builds the collapsible card for a single attempt.
 * @param {object} attempt - Attempt record from the results file.
 * @param {string} mode - Display name of the mode ("One-Shot" / "Self-Healing").
 * @param {number} attemptNumber - 1-based index within the mode.
 * @returns {HTMLDivElement}
 */
function createAttemptCard(attempt, mode, attemptNumber) {
  const card = document.createElement('div');
  card.className = 'attempt-card';

  // A counter cannot collide, unlike a short random string.
  attemptCardCounter += 1;
  const attemptId = `attempt-${attemptCardCounter}`;

  const attemptHeader = document.createElement('div');
  attemptHeader.className = 'attempt-card-header';
  attemptHeader.innerHTML = `
    <div class="attempt-title">
      <span class="attempt-mode">${mode} - Attempt ${attemptNumber}</span>
      <div class="attempt-times">
        <div class="time-item">
          <span class="time-label">Build:</span>
          <span>${formatSeconds(attempt.buildTime)}</span>
        </div>
        <div class="time-item">
          <span class="time-label">Execution:</span>
          <span>${formatSeconds(attempt.executionTime)}</span>
        </div>
      </div>
    </div>
    <div class="attempt-header-right">
      <div class="badge ${getStatusBadgeClass(attempt.status)}">${escapeHtml(formatStatus(attempt.status))}</div>
      <span class="expand-icon-small">▼</span>
    </div>
  `;

  const attemptBody = document.createElement('div');
  attemptBody.className = 'attempt-body';
  attemptBody.style.display = 'none';

  let html = `
    <div class="attempt-status">
      <div class="status-item">
        <div class="status-item-label">Build</div>
        <div class="status-item-value">${getStatusIcon(attempt.buildSuccess)} ${attempt.buildSuccess ? 'Success' : 'Failed'}</div>
      </div>
      <div class="status-item">
        <div class="status-item-label">Execution</div>
        <div class="status-item-value">${getStatusIcon(attempt.executionSuccess)} ${attempt.executionSuccess ? 'Success' : 'Failed'}</div>
      </div>
      <div class="status-item">
        <div class="status-item-label">Validation Func</div>
        <div class="status-item-value">${getStatusIcon(attempt.validationFunctionPassed)} ${formatValidationFuncStatus(attempt.validationFunctionPassed)}</div>
      </div>
    </div>
  `;

  // Add instruction
  if (attempt.instruction) {
    html += `
      <div class="instruction-box">
        <strong>Instruction:</strong> ${escapeHtml(attempt.instruction)}
      </div>
    `;
  }

  // Add description if present
  if (attempt.description) {
    html += `
      <div class="description-box">
        <strong>Expected Result:</strong> ${escapeHtml(attempt.description)}
      </div>
    `;
  }

  // Build error
  if (attempt.buildError) {
    html += `
      <div class="error-section">
        <div class="error-title">Build Error</div>
        <div class="error-content">${escapeHtml(attempt.buildError)}</div>
      </div>
    `;
  }

  // Execution error
  if (attempt.executionError) {
    html += `
      <div class="error-section">
        <div class="error-title">Execution Error</div>
        <div class="error-content">${escapeHtml(attempt.executionError)}</div>
      </div>
    `;
  }

  // Validation function error
  if (attempt.validationFunctionError) {
    html += `
      <div class="error-section">
        <div class="error-title">Validation Function Error</div>
        <div class="error-content">${escapeHtml(attempt.validationFunctionError)}</div>
      </div>
    `;
  }

  // LLM judgment
  if (attempt.llmJudgment) {
    html += `
      <div class="llm-section">
        <div class="llm-title">LLM Judge</div>
        <div class="llm-judgment">
          <span class="badge ${getLLMBadgeClass(attempt.llmJudgment)}">${formatLLMJudgment(attempt.llmJudgment)}</span>
        </div>
        ${attempt.llmReason ? `<div class="llm-reason">"${escapeHtml(attempt.llmReason)}"</div>` : ''}
      </div>
    `;
  }

  // Output data (skipped when the field is null or missing)
  if (attempt.data != null) {
    const jsonString = JSON.stringify(attempt.data, null, 2);
    html += `
      <div class="data-section">
        <div class="data-header">
          <div class="data-title">Output Data</div>
          <button class="copy-button">Copy JSON</button>
        </div>
        <div class="json-viewer">
          <pre id="${attemptId}">${escapeHtml(jsonString)}</pre>
        </div>
      </div>
    `;
  }

  attemptBody.innerHTML = html;

  // The button is wired here rather than through an inline onclick so the
  // clicked element can be handed to copyToClipboard explicitly.
  const copyButton = attemptBody.querySelector('.copy-button');
  if (copyButton) {
    copyButton.addEventListener('click', () => copyToClipboard(attemptId, copyButton));
  }

  // Toggle collapse/expand
  attemptHeader.addEventListener('click', () => {
    const isExpanded = attemptBody.style.display !== 'none';
    attemptBody.style.display = isExpanded ? 'none' : 'block';
    attemptHeader.querySelector('.expand-icon-small').textContent = isExpanded ? '▼' : '▲';
  });

  card.appendChild(attemptHeader);
  card.appendChild(attemptBody);
  return card;
}

/** Maps a status string to a badge class; "passed" wins over "partial" over "failed". */
function getStatusBadgeClass(status) {
  if (!status) return 'badge-neutral';
  if (status.includes('passed')) return 'badge-success';
  if (status.includes('partial')) return 'badge-partial';
  if (status.includes('failed')) return 'badge-failure';
  return 'badge-neutral';
}

function getLLMBadgeClass(judgment) {
  if (judgment === 'passes') return 'badge-success';
  if (judgment === 'partial') return 'badge-partial';
  return 'badge-failure';
}

/** Turns "validation_failed" into "Validation Failed"; empty input gives "Unknown". */
function formatStatus(status) {
  if (!status) return 'Unknown';
  return status.replace(/_/g, ' ').replace(/\b\w/g, (c) => c.toUpperCase());
}

function formatLLMJudgment(judgment) {
  if (judgment === 'passes') return '✓ Passes';
  if (judgment === 'partial') return '~ Partial';
  return '✗ Failed';
}

/** Icon for a tri-state flag: missing -> '-', true -> '✓', false -> '✗'. */
function getStatusIcon(success) {
  if (success == null) return '-';
  return success ? '✓' : '✗';
}

/** Label for a tri-state flag: missing -> 'N/A', true -> 'Passed', false -> 'Failed'. */
function formatBooleanStatus(value) {
  if (value == null) return 'N/A';
  return value ? 'Passed' : 'Failed';
}

/** Label for the validation-function result; same mapping as formatBooleanStatus. */
function formatValidationFuncStatus(value) {
  return formatBooleanStatus(value);
}

/**
 * Escapes text for insertion into HTML element content or a quoted attribute.
 * @param {*} text - Any value; null/undefined become ''.
 * @returns {string}
 */
function escapeHtml(text) {
  if (text == null) return '';
  return String(text).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);
}

/**
 * Copies the text content of an element to the clipboard and briefly flashes
 * "Copied!" on the triggering button.
 *
 * @param {string} elementId - Id of the element whose text is copied.
 * @param {HTMLElement} [button] - Button to give feedback on. When omitted
 *   (e.g. a legacy inline `onclick`), the target of the event currently being
 *   dispatched is used.
 */
function copyToClipboard(elementId, button) {
  // Must be captured synchronously: the global `event` is only set while the
  // click is being dispatched, not when the clipboard promise settles.
  const trigger = button ?? globalThis.event?.target ?? null;

  const element = document.getElementById(elementId);
  if (!element) {
    console.error('Failed to copy:', new Error(`element "${elementId}" not found`));
    return;
  }
  if (!navigator.clipboard) {
    // The async clipboard API is unavailable outside secure contexts.
    console.error('Failed to copy:', new Error('Clipboard API unavailable'));
    return;
  }

  navigator.clipboard
    .writeText(element.textContent)
    .then(() => {
      if (!trigger) return;
      // Visual feedback. The first label is remembered so a second click while
      // "Copied!" is showing cannot make "Copied!" the label to restore.
      trigger.dataset.originalLabel ??= trigger.textContent;
      trigger.textContent = 'Copied!';
      trigger.style.backgroundColor = '#28a745';
      setTimeout(() => {
        trigger.textContent = trigger.dataset.originalLabel;
        trigger.style.backgroundColor = '#FFA500';
      }, 1500);
    })
    .catch((err) => {
      console.error('Failed to copy:', err);
    });
}

// Load benchmark JSON (only works when served via HTTP, not file://)
async function loadBenchmarkJson() {
  try {
    const response = await fetch('../data/benchmark/tool-eval-benchmark.json');
    if (!response.ok) {
      console.log('No benchmark file available - use file input to load manually');
      return;
    }
    benchmarkData = validateResultsFile(await response.json());
    console.log('Benchmark auto-loaded:', benchmarkData.results.length, 'attempts');
    // The latest result may have finished loading first; redraw so it gets deltas.
    if (currentData) {
      renderDashboard(currentData);
    }
  } catch (error) {
    console.log('Benchmark auto-load failed (expected with file:// protocol) - use file input to load manually', error);
  }
}

// Auto-load latest result if available (optional)
async function tryLoadLatestResult() {
  try {
    // Try to load a default file - user can adjust this path
    const response = await fetch('../data/results/tool-eval-latest.json');
    if (response.ok) {
      const data = validateResultsFile(await response.json());
      currentData = data;
      renderDashboard(data);
    }
  } catch (error) {
    // Not fatal - user will load file manually
    console.log('No default file available, please select a file', error);
  }
}

// Wire inputs, load benchmark and try to auto-load on page load. Skipped when
// there is no DOM so the pure helpers above can be loaded elsewhere.
if (typeof document !== 'undefined') {
  registerFileInputs();
  loadBenchmarkJson();
  tryLoadLatestResult();
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/superglue-ai/superglue'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.