/**
* WebSee MCP Server Evaluation Framework
*
* This module provides a comprehensive evaluation system for the WebSee MCP server
* following Anthropic's MCP builder standards for quality assurance.
*
* @module evaluation
*/
import { chromium } from 'playwright';
import * as fs from 'fs/promises';
import * as path from 'path';
import { fileURLToPath } from 'url';
// ============================================================================
// Test Case Definitions
// ============================================================================
export const TEST_CASES = [
// 1. Debugging React Component State Issues
{
id: 'eval-001',
category: 'Component Debugging',
description: 'Debug React component state issues by inspecting component props and state',
tool: 'inspect_component_state',
input: {
url: 'http://localhost:3000/app',
selector: '#user-profile',
waitForSelector: true,
includeChildren: true,
},
expectedOutput: {
fields: ['selector', 'component', 'children'],
conditions: [
{ field: 'component.name', type: 'exists' },
{ field: 'component.framework', type: 'exists' },
{ field: 'component.props', type: 'exists' },
{ field: 'component.state', type: 'exists' },
],
},
scoring: {
maxPoints: 100,
criteria: [
{
description: 'Component name is correctly identified',
points: 20,
validator: 'output.component && output.component.name && output.component.name.length > 0',
},
{
description: 'Framework is detected (React/Vue/Angular)',
points: 20,
validator: "output.component && ['react', 'vue', 'angular'].includes(output.component.framework?.toLowerCase())",
},
{
description: 'Props are extracted and non-empty',
points: 20,
validator: 'output.component && output.component.props && Object.keys(output.component.props).length > 0',
},
{
description: 'State is captured',
points: 20,
validator: 'output.component && output.component.state !== undefined',
},
{
description: 'Child components are included when requested',
points: 20,
validator: 'output.children && Array.isArray(output.children)',
},
],
},
performanceBenchmark: {
maxResponseTime: 5000,
expectedAccuracy: 90,
},
},
// 2. Analyzing Slow Network Requests
{
id: 'eval-002',
category: 'Network Analysis',
description: 'Identify and analyze slow network requests with timing information',
tool: 'analyze_performance',
input: {
url: 'http://localhost:3000/app',
metrics: ['network'],
},
expectedOutput: {
fields: ['url', 'timestamp', 'metrics'],
conditions: [
{ field: 'metrics.network', type: 'exists' },
{ field: 'metrics.network.totalRequests', type: 'greaterThan', value: 0 },
{ field: 'metrics.network.averageDuration', type: 'exists' },
],
},
scoring: {
maxPoints: 100,
criteria: [
{
description: 'Total request count is captured',
points: 15,
validator: 'output.metrics?.network?.totalRequests >= 0',
},
{
description: 'Slow requests are identified (>1000ms)',
points: 25,
validator: 'output.metrics?.network?.slowRequests !== undefined',
},
{
description: 'Average duration is calculated',
points: 20,
validator: 'output.metrics?.network?.averageDuration > 0',
},
{
description: 'Slowest requests are listed with details',
points: 25,
validator: 'Array.isArray(output.metrics?.network?.slowestRequests) && output.metrics.network.slowestRequests.every(r => r.url && r.duration)',
},
{
description: 'Stack trace shows what triggered the request',
points: 15,
validator: 'output.metrics?.network?.slowestRequests?.some(r => r.triggeredBy)',
},
],
},
performanceBenchmark: {
maxResponseTime: 8000,
expectedAccuracy: 95,
},
},
// 3. Resolving Minified Error Stack Traces
{
id: 'eval-003',
category: 'Error Resolution',
description: 'Resolve minified error stack traces to original source code locations',
tool: 'resolve_minified_error',
input: {
url: 'http://localhost:3000/app',
errorStack: "Error: Cannot read property 'name' of undefined\n at t.render (app.min.js:1:28473)",
triggerError: false,
},
expectedOutput: {
fields: ['resolved', 'original', 'sourceMap'],
conditions: [
{ field: 'resolved', type: 'equals', value: true },
{ field: 'sourceMap', type: 'exists' },
{ field: 'sourceMap', type: 'arrayLength', value: 1 },
],
},
scoring: {
maxPoints: 100,
criteria: [
{
description: 'Stack trace is successfully resolved',
points: 30,
validator: 'output.resolved === true',
},
{
description: 'Original source file is identified',
points: 25,
validator: "output.sourceMap && output.sourceMap.some(line => line.includes('.tsx') || line.includes('.ts') || line.includes('.jsx') || line.includes('.js'))",
},
{
description: 'Line and column numbers are provided',
points: 20,
validator: 'output.sourceMap && output.sourceMap.some(line => /:\\d+:\\d+/.test(line))',
},
{
description: 'Original error is preserved',
points: 15,
validator: 'output.original && output.original.length > 0',
},
{
description: 'Source map resolution message is clear',
points: 10,
validator: "output.message && output.message.includes('source map')",
},
],
},
performanceBenchmark: {
maxResponseTime: 6000,
expectedAccuracy: 85,
},
},
// 4. Finding Large Bundle Modules
{
id: 'eval-004',
category: 'Bundle Analysis',
description: 'Identify large modules in the JavaScript bundle that exceed size thresholds',
tool: 'analyze_bundle_size',
input: {
url: 'http://localhost:3000/app',
moduleName: 'lodash',
threshold: 50,
},
expectedOutput: {
fields: ['url', 'scripts', 'stylesheets', 'modules', 'recommendations'],
conditions: [
{ field: 'scripts.total', type: 'greaterThan', value: 0 },
{ field: 'scripts.files', type: 'exists' },
{ field: 'recommendations', type: 'exists' },
],
},
scoring: {
maxPoints: 100,
criteria: [
{
description: 'Total script count is captured',
points: 15,
validator: 'output.scripts && output.scripts.total >= 0',
},
{
description: 'Script files are listed with sources',
points: 20,
validator: 'Array.isArray(output.scripts?.files) && output.scripts.files.length > 0',
},
{
description: 'Module search works when specified',
points: 25,
validator: 'Array.isArray(output.modules) && (output.modules.length === 0 || output.modules[0].name)',
},
{
description: 'Size threshold recommendations are generated',
points: 25,
validator: 'Array.isArray(output.recommendations)',
},
{
description: 'Stylesheets are analyzed',
points: 15,
validator: 'output.stylesheets && output.stylesheets.total >= 0',
},
],
},
performanceBenchmark: {
maxResponseTime: 7000,
expectedAccuracy: 88,
},
},
// 5. Tracing User Interaction Flows
{
id: 'eval-005',
category: 'Interaction Tracing',
description: 'Trace network activity triggered by user interactions',
tool: 'trace_network_requests',
input: {
url: 'http://localhost:3000/app',
pattern: '/api/*',
method: 'GET',
waitTime: 3000,
},
expectedOutput: {
fields: ['url', 'pattern', 'method', 'totalRequests', 'requests'],
conditions: [
{ field: 'totalRequests', type: 'exists' },
{ field: 'requests', type: 'exists' },
],
},
scoring: {
maxPoints: 100,
criteria: [
{
description: 'URL pattern filtering works correctly',
points: 25,
validator: "output.pattern && output.requests.every(r => !output.pattern || r.url.includes(output.pattern.replace('*', '')))",
},
{
description: 'HTTP method filtering is applied',
points: 20,
validator: "output.method && (output.method === 'ALL' || output.requests.every(r => r.method === output.method))",
},
{
description: 'Request details include URL, method, status',
points: 20,
validator: 'output.requests && output.requests.every(r => r.url && r.method && r.status !== undefined)',
},
{
description: 'Request timing information is captured',
points: 20,
validator: 'output.requests && output.requests.every(r => r.duration !== undefined && r.timestamp)',
},
{
description: 'Stack traces show request origin',
points: 15,
validator: 'output.requests && output.requests.some(r => r.triggeredBy)',
},
],
},
performanceBenchmark: {
maxResponseTime: 6000,
expectedAccuracy: 92,
},
},
// 6. Memory Leak Detection Scenarios
{
id: 'eval-006',
category: 'Memory Analysis',
description: 'Analyze memory usage and detect potential memory leaks',
tool: 'analyze_performance',
input: {
url: 'http://localhost:3000/app',
metrics: ['memory', 'components'],
},
expectedOutput: {
fields: ['metrics'],
conditions: [
{ field: 'metrics.memory', type: 'exists' },
{ field: 'metrics.components', type: 'exists' },
],
},
scoring: {
maxPoints: 100,
criteria: [
{
description: 'Memory metrics are captured',
points: 30,
validator: 'output.metrics?.memory !== undefined && output.metrics?.memory !== null',
},
{
description: 'Heap size information is provided',
points: 25,
validator: 'output.metrics?.memory === null || output.metrics?.memory?.usedJSHeapSize !== undefined',
},
{
description: 'Component count is tracked',
points: 20,
validator: 'output.metrics?.components?.totalComponents >= 0',
},
{
description: 'Component nesting depth is analyzed',
points: 15,
validator: 'output.metrics?.components?.deepestNesting >= 0',
},
{
description: 'Components grouped by framework',
points: 10,
validator: "output.metrics?.components?.byFramework && typeof output.metrics.components.byFramework === 'object'",
},
],
},
performanceBenchmark: {
maxResponseTime: 7000,
expectedAccuracy: 80,
},
},
// 7. Cross-Browser Compatibility Checks
{
id: 'eval-007',
category: 'Cross-Browser Testing',
description: 'Test frontend functionality across different browsers',
tool: 'debug_frontend_issue',
input: {
url: 'http://localhost:3000/app',
selector: '#main-content',
screenshot: false,
},
expectedOutput: {
fields: ['url', 'timestamp', 'issues', 'components', 'network', 'console'],
conditions: [
{ field: 'console', type: 'exists' },
{ field: 'network', type: 'exists' },
{ field: 'issues', type: 'exists' },
],
},
scoring: {
maxPoints: 100,
criteria: [
{
description: 'Console errors and warnings are captured',
points: 25,
validator: 'Array.isArray(output.console)',
},
{
description: 'Network requests are tracked',
points: 20,
validator: 'Array.isArray(output.network) && output.network.length >= 0',
},
{
description: 'Component information is extracted',
points: 20,
validator: 'Array.isArray(output.components)',
},
{
description: 'Issues are identified and categorized',
points: 20,
validator: 'Array.isArray(output.issues)',
},
{
description: 'Timestamp for debugging timeline',
points: 15,
validator: 'output.timestamp && new Date(output.timestamp).getTime() > 0',
},
],
},
performanceBenchmark: {
maxResponseTime: 8000,
expectedAccuracy: 90,
},
},
// 8. Performance Bottleneck Identification
{
id: 'eval-008',
category: 'Performance Optimization',
description: 'Identify performance bottlenecks in frontend applications',
tool: 'analyze_performance',
input: {
url: 'http://localhost:3000/app',
metrics: ['network', 'bundle', 'components'],
interactions: [{ action: 'scroll', selector: undefined, value: undefined }],
},
expectedOutput: {
fields: ['metrics'],
conditions: [
{ field: 'metrics.network', type: 'exists' },
{ field: 'metrics.bundle', type: 'exists' },
{ field: 'metrics.components', type: 'exists' },
],
},
scoring: {
maxPoints: 100,
criteria: [
{
description: 'Network performance metrics are comprehensive',
points: 30,
validator: 'output.metrics?.network && output.metrics.network.totalRequests >= 0 && output.metrics.network.averageDuration >= 0',
},
{
description: 'Bundle size analysis is performed',
points: 25,
validator: 'output.metrics?.bundle && output.metrics.bundle.totalScripts >= 0',
},
{
description: 'Largest scripts are identified',
points: 20,
validator: 'Array.isArray(output.metrics?.bundle?.largestScripts)',
},
{
description: 'Component metrics show optimization opportunities',
points: 15,
validator: 'output.metrics?.components?.totalComponents >= 0 && output.metrics.components.deepestNesting >= 0',
},
{
description: 'User interactions are properly executed',
points: 10,
validator: 'output.url && output.timestamp',
},
],
},
performanceBenchmark: {
maxResponseTime: 10000,
expectedAccuracy: 85,
},
},
// 9. Component Tree Analysis
{
id: 'eval-009',
category: 'Component Architecture',
description: 'Analyze component hierarchy and relationships',
tool: 'analyze_performance',
input: {
url: 'http://localhost:3000/app',
metrics: ['components'],
},
expectedOutput: {
fields: ['metrics.components'],
conditions: [
{ field: 'metrics.components.totalComponents', type: 'exists' },
{ field: 'metrics.components.byFramework', type: 'exists' },
{ field: 'metrics.components.deepestNesting', type: 'exists' },
],
},
scoring: {
maxPoints: 100,
criteria: [
{
description: 'Total component count is accurate',
points: 30,
validator: 'output.metrics?.components?.totalComponents >= 0',
},
{
description: 'Components are grouped by framework',
points: 25,
validator: "output.metrics?.components?.byFramework && typeof output.metrics.components.byFramework === 'object'",
},
{
description: 'Nesting depth is calculated',
points: 25,
validator: 'output.metrics?.components?.deepestNesting >= 0',
},
{
description: 'Framework detection is working',
points: 20,
validator: 'output.metrics?.components?.byFramework && Object.keys(output.metrics.components.byFramework).length >= 0',
},
],
},
performanceBenchmark: {
maxResponseTime: 6000,
expectedAccuracy: 88,
},
},
// 10. Build Optimization Recommendations
{
id: 'eval-010',
category: 'Build Optimization',
description: 'Generate actionable recommendations for build optimization',
tool: 'analyze_bundle_size',
input: {
url: 'http://localhost:3000/app',
threshold: 100,
},
expectedOutput: {
fields: ['recommendations'],
conditions: [
{ field: 'recommendations', type: 'exists' },
{ field: 'scripts', type: 'exists' },
],
},
scoring: {
maxPoints: 100,
criteria: [
{
description: 'Recommendations are generated',
points: 30,
validator: 'Array.isArray(output.recommendations)',
},
{
description: 'Recommendations are actionable and specific',
points: 25,
validator: "output.recommendations && output.recommendations.every(r => typeof r === 'string' && r.length > 20)",
},
{
description: 'Threshold-based warnings are included',
points: 20,
validator: "output.recommendations && output.recommendations.some(r => r.includes('KB') || r.includes('threshold'))",
},
{
description: 'Code splitting suggestions when appropriate',
points: 15,
validator: "output.recommendations && (output.recommendations.length === 0 || output.recommendations.some(r => r.toLowerCase().includes('split')))",
},
{
description: 'Total bundle size is calculated',
points: 10,
validator: 'output.scripts && output.scripts.totalSize >= 0',
},
],
},
performanceBenchmark: {
maxResponseTime: 7000,
expectedAccuracy: 85,
},
},
];
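// The engine accepts any array of this shape, so focused runs can be composed
// from a subset of TEST_CASES or from custom cases. A minimal sketch
// (hypothetical id, description, and thresholds):
//
//   const networkOnly = TEST_CASES.filter(tc => tc.category === 'Network Analysis');
//   const engine = new EvaluationEngine([
//       ...networkOnly,
//       {
//           id: 'eval-custom-001',
//           category: 'Network Analysis',
//           description: 'Trace POST traffic only',
//           tool: 'trace_network_requests',
//           input: { url: 'http://localhost:3000/app', method: 'POST', waitTime: 2000 },
//           expectedOutput: { fields: ['requests'], conditions: [{ field: 'requests', type: 'exists' }] },
//           scoring: {
//               maxPoints: 10,
//               criteria: [{ description: 'Requests are returned', points: 10, validator: 'Array.isArray(output.requests)' }],
//           },
//           performanceBenchmark: { maxResponseTime: 6000, expectedAccuracy: 80 },
//       },
//   ]);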
// ============================================================================
// Evaluation Engine
// ============================================================================
export class EvaluationEngine {
browser = null;
testCases;
results = [];
constructor(testCases = TEST_CASES) {
this.testCases = testCases;
}
/**
* Initialize the evaluation engine
*/
async initialize() {
this.browser = await chromium.launch({ headless: true });
        console.log('✅ Evaluation engine initialized');
}
/**
* Run a single test case
*/
async runTestCase(testCase) {
const startTime = Date.now();
const result = {
testId: testCase.id,
passed: false,
score: 0,
maxScore: testCase.scoring.maxPoints,
responseTime: 0,
errors: [],
warnings: [],
output: null,
validationResults: [],
};
try {
// Simulate MCP tool call
// In real implementation, this would call the actual MCP server
result.output = await this.simulateToolCall(testCase.tool, testCase.input);
// Validate output structure
this.validateOutputStructure(testCase, result);
// Evaluate scoring criteria
this.evaluateScoringCriteria(testCase, result);
// Check performance benchmarks
result.responseTime = Date.now() - startTime;
if (result.responseTime > testCase.performanceBenchmark.maxResponseTime) {
result.warnings.push(`Response time ${result.responseTime}ms exceeds benchmark of ${testCase.performanceBenchmark.maxResponseTime}ms`);
}
// Determine pass/fail
const scorePercentage = (result.score / result.maxScore) * 100;
result.passed = scorePercentage >= testCase.performanceBenchmark.expectedAccuracy;
}
catch (error) {
result.errors.push(`Test execution failed: ${error instanceof Error ? error.message : String(error)}`);
result.passed = false;
}
return result;
}
/**
* Validate that output contains expected fields and meets conditions
*/
validateOutputStructure(testCase, result) {
const { expectedOutput } = testCase;
// Check required fields
for (const field of expectedOutput.fields) {
const value = this.getNestedValue(result.output, field);
if (value === undefined) {
result.warnings.push(`Expected field '${field}' not found in output`);
}
}
// Check conditions
for (const condition of expectedOutput.conditions) {
const value = this.getNestedValue(result.output, condition.field);
let conditionMet = false;
switch (condition.type) {
case 'exists':
conditionMet = value !== undefined && value !== null;
break;
case 'equals':
conditionMet = value === condition.value;
break;
case 'contains':
conditionMet = typeof value === 'string' && value.includes(String(condition.value));
break;
case 'greaterThan':
conditionMet = typeof value === 'number' && value > (condition.value || 0);
break;
case 'lessThan':
conditionMet = typeof value === 'number' && value < (condition.value || 0);
break;
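                // Note: arrayLength is treated as a minimum length, not an exact match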
case 'arrayLength':
conditionMet = Array.isArray(value) && value.length >= (condition.value || 0);
break;
}
if (!conditionMet) {
result.warnings.push(`Condition not met: ${condition.field} ${condition.type} ${condition.value !== undefined ? condition.value : ''}`);
}
}
}
/**
* Evaluate scoring criteria using validator expressions
*/
evaluateScoringCriteria(testCase, result) {
for (const criterion of testCase.scoring.criteria) {
let passed = false;
try {
                // Evaluate the validator expression. Note: new Function is not a
                // sandbox, so validators must come only from trusted test definitions.
const evalFunc = new Function('output', `return ${criterion.validator}`);
passed = Boolean(evalFunc(result.output));
}
catch (error) {
result.warnings.push(`Validator error for "${criterion.description}": ${error instanceof Error ? error.message : String(error)}`);
}
result.validationResults.push({
criterion: criterion.description,
passed,
points: passed ? criterion.points : 0,
});
if (passed) {
result.score += criterion.points;
}
}
}
/**
* Get nested value from object using dot notation
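     * Example: getNestedValue({ a: { b: 1 } }, 'a.b') returns 1; missing segments yield undefined.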
*/
getNestedValue(obj, path) {
return path.split('.').reduce((current, key) => current?.[key], obj);
}
/**
* Simulate MCP tool call (placeholder for actual implementation)
*/
async simulateToolCall(tool, input) {
// This is a placeholder that returns mock data
// In real implementation, this would call the actual MCP server
console.log(`[SIMULATION] Calling tool: ${tool} with input:`, JSON.stringify(input, null, 2));
// Return mock responses based on tool
switch (tool) {
case 'inspect_component_state':
return {
selector: input.selector,
component: {
name: 'UserProfile',
framework: 'react',
props: { userId: '123' },
state: { loading: false },
source: { file: 'src/components/UserProfile.tsx', line: 42 },
parents: ['App', 'Dashboard'],
},
children: input.includeChildren ? [{ name: 'Avatar', props: {} }] : undefined,
};
case 'analyze_performance':
return {
url: input.url,
timestamp: new Date().toISOString(),
metrics: this.getMockMetrics(input.metrics),
};
case 'resolve_minified_error':
return {
resolved: true,
original: input.errorStack,
sourceMap: [
"Error: Cannot read property 'name' of undefined",
' at UserProfile.render (src/components/UserProfile.tsx:87:15)',
],
message: 'Stack trace resolved using source maps',
};
case 'analyze_bundle_size':
return {
url: input.url,
scripts: {
total: 5,
totalSize: 524288,
files: [
{ src: 'main.js', size: 262144, async: false, defer: false },
{ src: 'vendor.js', size: 262144, async: false, defer: false },
],
},
stylesheets: {
total: 2,
files: [{ href: 'styles.css', media: 'all' }],
},
modules: input.moduleName ? [{ name: input.moduleName, found: false }] : [],
recommendations: [
'Found 2 script(s) larger than 100 KB. Consider code splitting for better performance.',
],
};
case 'trace_network_requests':
return {
url: input.url,
pattern: input.pattern,
method: input.method,
totalRequests: 3,
requests: [
{
url: '/api/users/123',
method: 'GET',
status: 200,
duration: 245,
size: 1024,
triggeredBy: { file: 'UserProfile.tsx', line: 42 },
timestamp: Date.now(),
},
],
};
case 'debug_frontend_issue':
return {
url: input.url,
timestamp: new Date().toISOString(),
issues: [],
components: input.selector
? [
{
selector: input.selector,
name: 'MainContent',
framework: 'react',
props: {},
state: {},
},
]
: [],
network: [{ url: '/api/data', method: 'GET', status: 200, duration: 150 }],
console: [],
};
default:
return {};
}
}
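    // Replacing the simulation with real calls would route through an MCP
    // client. A minimal sketch, assuming the @modelcontextprotocol/sdk package
    // and a stdio transport (the command/args for launching the WebSee server
    // are placeholders):
    //
    //   import { Client } from '@modelcontextprotocol/sdk/client/index.js';
    //   import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';
    //
    //   const transport = new StdioClientTransport({ command: 'node', args: ['dist/index.js'] });
    //   const client = new Client({ name: 'websee-eval', version: '1.0.0' });
    //   await client.connect(transport);
    //   const result = await client.callTool({ name: tool, arguments: input });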
/**
* Get mock metrics based on requested types
*/
getMockMetrics(metricTypes) {
const metrics = {};
if (metricTypes.includes('network')) {
metrics.network = {
totalRequests: 15,
slowRequests: 2,
averageDuration: 342,
slowestRequests: [
{ url: '/api/data', duration: 1250, triggeredBy: { file: 'App.tsx', line: 23 } },
{ url: '/api/users', duration: 1100, triggeredBy: { file: 'UserList.tsx', line: 56 } },
],
};
}
if (metricTypes.includes('components')) {
metrics.components = {
totalComponents: 12,
byFramework: { react: 12 },
deepestNesting: 5,
};
}
if (metricTypes.includes('bundle')) {
metrics.bundle = {
totalScripts: 5,
totalSize: 524288,
largestScripts: [
{ src: 'vendor.js', size: 262144 },
{ src: 'main.js', size: 131072 },
],
};
}
if (metricTypes.includes('memory')) {
metrics.memory = {
usedJSHeapSize: '45 MB',
totalJSHeapSize: '60 MB',
limit: '2048 MB',
};
}
return metrics;
}
/**
* Run all test cases
*/
async runAllTests() {
        console.log(`\n🧪 Running ${this.testCases.length} evaluation tests...\n`);
this.results = [];
for (const testCase of this.testCases) {
console.log(`Running: ${testCase.id} - ${testCase.description}`);
const result = await this.runTestCase(testCase);
this.results.push(result);
            const status = result.passed ? '✅ PASS' : '❌ FAIL';
console.log(` ${status} - Score: ${result.score}/${result.maxScore} (${result.responseTime}ms)\n`);
}
return this.generateReport();
}
/**
* Generate comprehensive evaluation report
*/
generateReport() {
const passedTests = this.results.filter(r => r.passed).length;
const totalScore = this.results.reduce((sum, r) => sum + r.score, 0);
const maxPossibleScore = this.results.reduce((sum, r) => sum + r.maxScore, 0);
const averageResponseTime = this.results.reduce((sum, r) => sum + r.responseTime, 0) / this.results.length;
// Category breakdown
const categoryBreakdown = {};
for (const result of this.results) {
const testCase = this.testCases.find(tc => tc.id === result.testId);
if (!categoryBreakdown[testCase.category]) {
categoryBreakdown[testCase.category] = {
passed: 0,
failed: 0,
score: 0,
maxScore: 0,
};
}
if (result.passed) {
categoryBreakdown[testCase.category].passed++;
}
else {
categoryBreakdown[testCase.category].failed++;
}
categoryBreakdown[testCase.category].score += result.score;
categoryBreakdown[testCase.category].maxScore += result.maxScore;
}
// Performance metrics by tool
const toolMetrics = {};
for (const result of this.results) {
const testCase = this.testCases.find(tc => tc.id === result.testId);
if (!toolMetrics[testCase.tool]) {
toolMetrics[testCase.tool] = { times: [], successes: 0, total: 0 };
}
toolMetrics[testCase.tool].times.push(result.responseTime);
toolMetrics[testCase.tool].total++;
if (result.passed) {
toolMetrics[testCase.tool].successes++;
}
}
const performanceMetrics = Object.entries(toolMetrics).map(([tool, data]) => ({
toolName: tool,
averageTime: data.times.reduce((a, b) => a + b, 0) / data.times.length,
minTime: Math.min(...data.times),
maxTime: Math.max(...data.times),
successRate: (data.successes / data.total) * 100,
}));
return {
timestamp: new Date().toISOString(),
totalTests: this.testCases.length,
passedTests,
failedTests: this.testCases.length - passedTests,
totalScore,
maxPossibleScore,
scorePercentage: (totalScore / maxPossibleScore) * 100,
averageResponseTime,
testResults: this.results,
categoryBreakdown,
performanceMetrics,
};
}
/**
* Print report to console
*/
printReport(report) {
console.log('\n' + '='.repeat(80));
console.log('WEBSEE MCP SERVER - EVALUATION REPORT');
console.log('='.repeat(80));
console.log(`\nTimestamp: ${report.timestamp}`);
console.log(`\nOverall Results:`);
console.log(` Total Tests: ${report.totalTests}`);
        console.log(`  Passed: ${report.passedTests} ✅`);
        console.log(`  Failed: ${report.failedTests} ❌`);
console.log(` Score: ${report.totalScore}/${report.maxPossibleScore} (${report.scorePercentage.toFixed(2)}%)`);
console.log(` Average Response Time: ${report.averageResponseTime.toFixed(2)}ms`);
console.log(`\nCategory Breakdown:`);
for (const [category, stats] of Object.entries(report.categoryBreakdown)) {
const percentage = (stats.score / stats.maxScore) * 100;
console.log(` ${category}:`);
console.log(` Passed: ${stats.passed}, Failed: ${stats.failed}`);
console.log(` Score: ${stats.score}/${stats.maxScore} (${percentage.toFixed(2)}%)`);
}
console.log(`\nPerformance Metrics by Tool:`);
for (const metric of report.performanceMetrics) {
console.log(` ${metric.toolName}:`);
console.log(` Average Time: ${metric.averageTime.toFixed(2)}ms`);
console.log(` Min/Max: ${metric.minTime}ms / ${metric.maxTime}ms`);
console.log(` Success Rate: ${metric.successRate.toFixed(2)}%`);
}
console.log(`\nDetailed Results:`);
for (const result of report.testResults) {
const testCase = this.testCases.find(tc => tc.id === result.testId);
            const status = result.passed ? '✅ PASS' : '❌ FAIL';
console.log(`\n ${status} ${result.testId}: ${testCase.description}`);
console.log(` Score: ${result.score}/${result.maxScore} | Time: ${result.responseTime}ms`);
if (result.errors.length > 0) {
console.log(` Errors:`);
result.errors.forEach(e => console.log(` - ${e}`));
}
if (result.warnings.length > 0) {
console.log(` Warnings:`);
result.warnings.forEach(w => console.log(` - ${w}`));
}
}
console.log('\n' + '='.repeat(80) + '\n');
}
/**
* Save report to JSON file
*/
    async saveReport(report, filepath) {
        // Ensure the target directory exists before writing (the default path
        // points into an eval/ subdirectory that may not exist yet)
        await fs.mkdir(path.dirname(filepath), { recursive: true });
        await fs.writeFile(filepath, JSON.stringify(report, null, 2));
        console.log(`📄 Report saved to: ${filepath}`);
    }
/**
* Cleanup resources
*/
async destroy() {
if (this.browser) {
await this.browser.close();
this.browser = null;
}
}
}
// ============================================================================
// CLI Runner
// ============================================================================
export async function runEvaluation(outputPath) {
const engine = new EvaluationEngine();
try {
await engine.initialize();
const report = await engine.runAllTests();
engine.printReport(report);
if (outputPath) {
await engine.saveReport(report, outputPath);
}
else {
const defaultPath = path.join(process.cwd(), 'eval', `evaluation-report-${Date.now()}.json`);
await engine.saveReport(report, defaultPath);
}
}
finally {
await engine.destroy();
}
}
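// Example programmatic usage (output path is illustrative):
//
//   import { runEvaluation } from './evaluation.js';
//   await runEvaluation('./eval/nightly-report.json');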
// Run if executed directly (ESM equivalent of the CommonJS require.main === module check)
if (process.argv[1] && fileURLToPath(import.meta.url) === path.resolve(process.argv[1])) {
    runEvaluation().catch(console.error);
}
//# sourceMappingURL=evaluation.js.map