think-mcp

think
web

benchmark-mcp.mjs•21.3 KiB

#!/usr/bin/env node /** * MCP Server Performance Benchmark * * Compares current implementation against previous baselines. * Focuses on performance metrics, not UX improvements. * * Run: node benchmark-mcp.mjs [port] */ const PORT = process.argv[2] || 3000; const BASE_URL = `http://localhost:${PORT}/api/mcp`; // Previous baseline averages (from test-results/think-mcp-eval-results.jsonl) const BASELINE = { trace: { avgMs: 177, tests: 5 }, model: { avgMs: 196, tests: 6 }, pattern: { avgMs: 199, tests: 6 }, paradigm: { avgMs: 189, tests: 6 }, debug: { avgMs: 194, tests: 6 }, council: { avgMs: 300, tests: 2 }, decide: { avgMs: 229, tests: 3 }, reflect: { avgMs: 204, tests: 3 }, hypothesis: { avgMs: 249, tests: 3 }, debate: { avgMs: 187, tests: 4 }, map: { avgMs: 206, tests: 3 }, }; // Test scenarios extracted from previous baseline tests const TEST_SCENARIOS = { trace: [ { testId: 'trace-basic-001', args: { thought: 'Breaking down a complex software architecture decision', thoughtNumber: 1, totalThoughts: 5, nextThoughtNeeded: true } }, { testId: 'trace-edge-001', args: { thought: 'Revising earlier assumption', thoughtNumber: 3, totalThoughts: 5, nextThoughtNeeded: true, isRevision: true, revisesThought: 1 } }, { testId: 'trace-edge-002', args: { thought: 'Branching to explore alternative', thoughtNumber: 4, totalThoughts: 6, nextThoughtNeeded: true, branchFromThought: 2, branchId: 'alt-approach-1' } }, { testId: 'trace-edge-003', args: { thought: 'Final conclusion', thoughtNumber: 5, totalThoughts: 5, nextThoughtNeeded: false } }, ], model: [ { testId: 'model-basic-001', args: { modelName: 'first_principles', problem: 'Designing scalable microservices' } }, { testId: 'model-edge-001', args: { modelName: 'opportunity_cost', problem: 'Build vs buy decision' } }, { testId: 'model-edge-002', args: { modelName: 'rubber_duck', problem: 'Async function debugging' } }, { testId: 'model-edge-003', args: { modelName: 'pareto_principle', problem: 'Bug prioritization' } }, { testId: 'model-edge-004', args: { modelName: 'occams_razor', problem: 'Slow app diagnosis' } }, { testId: 'model-edge-005', args: { modelName: 'error_propagation', problem: 'Cascading failure analysis' } }, ], pattern: [ { testId: 'pattern-basic-001', args: { patternName: 'api_integration', context: 'REST API gateway design' } }, { testId: 'pattern-edge-001', args: { patternName: 'state_management', context: 'React form state' } }, { testId: 'pattern-edge-002', args: { patternName: 'async_processing', context: 'File upload with progress' } }, { testId: 'pattern-edge-003', args: { patternName: 'security', context: 'API authentication' } }, { testId: 'pattern-edge-004', args: { patternName: 'scalability', context: '10x traffic growth' } }, { testId: 'pattern-edge-005', args: { patternName: 'agentic_design', context: 'Autonomous AI agent' } }, ], paradigm: [ { testId: 'paradigm-basic-001', args: { paradigmName: 'functional', problem: 'Transaction processing' } }, { testId: 'paradigm-edge-001', args: { paradigmName: 'reactive', problem: 'Real-time dashboard' } }, { testId: 'paradigm-edge-002', args: { paradigmName: 'event_driven', problem: 'Microservices notifications' } }, { testId: 'paradigm-edge-003', args: { paradigmName: 'object_oriented', problem: 'Complex domain modeling' } }, { testId: 'paradigm-edge-004', args: { paradigmName: 'concurrent', problem: 'Parallel API processing' } }, ], debug: [ { testId: 'debug-basic-001', args: { approachName: 'binary_search', issue: 'Intermittent crashes after deployment' } }, { testId: 'debug-edge-001', args: { approachName: 'divide_conquer', issue: 'Memory leak investigation' } }, { testId: 'debug-edge-002', args: { approachName: 'cause_elimination', issue: 'Inconsistent login failures' } }, { testId: 'debug-edge-003', args: { approachName: 'backtracking', issue: 'Recursive algorithm edge case' } }, { testId: 'debug-edge-004', args: { approachName: 'program_slicing', issue: 'Unexpected variable value' } }, ], council: [ { testId: 'council-basic-001', args: { topic: 'Monolith to microservices migration', personas: [ { id: 'arch', name: 'Architect', expertise: ['system design'], background: 'Senior architect', perspective: 'Technical', biases: ['complexity'], communication: { style: 'formal', tone: 'analytical' } }, { id: 'dev', name: 'Developer', expertise: ['implementation'], background: 'Staff engineer', perspective: 'Practical', biases: ['simplicity'], communication: { style: 'direct', tone: 'pragmatic' } }, { id: 'pm', name: 'Product Manager', expertise: ['product'], background: 'Senior PM', perspective: 'Business', biases: ['features'], communication: { style: 'collaborative', tone: 'optimistic' } }, ], contributions: [], stage: 'ideation', activePersonaId: 'arch', sessionId: 'bench-001', iteration: 0, nextContributionNeeded: true }}, ], decide: [ { testId: 'decide-basic-001', args: { decisionStatement: 'Cloud provider selection', options: [ { name: 'AWS', description: 'Amazon Web Services' }, { name: 'GCP', description: 'Google Cloud Platform' }, { name: 'Azure', description: 'Microsoft Azure' }, ], analysisType: 'weighted-criteria', stage: 'evaluation', decisionId: 'bench-decide-001', iteration: 0, nextStageNeeded: true }}, { testId: 'decide-edge-001', args: { decisionStatement: 'GraphQL vs REST', options: [ { name: 'GraphQL', description: 'Query language for APIs' }, { name: 'REST', description: 'Representational State Transfer' }, ], analysisType: 'pros-cons', stage: 'options-generation', decisionId: 'bench-decide-002', iteration: 0, nextStageNeeded: true }}, ], reflect: [ { testId: 'reflect-basic-001', args: { task: 'System design assessment', stage: 'monitoring', overallConfidence: 0.7, uncertaintyAreas: ['scalability'], recommendedApproach: 'iterative review', monitoringId: 'bench-reflect-001', iteration: 0, nextAssessmentNeeded: true }}, ], hypothesis: [ { testId: 'hypothesis-basic-001', args: { stage: 'hypothesis', inquiryId: 'bench-hyp-001', observation: 'API response times increased 3x', iteration: 0, nextStageNeeded: true }}, { testId: 'hypothesis-edge-001', args: { stage: 'experiment', inquiryId: 'bench-hyp-002', iteration: 2, nextStageNeeded: true }}, ], debate: [ { testId: 'debate-basic-001', args: { claim: 'Serverless is superior', premises: ['Reduced operational overhead', 'Auto-scaling'], conclusion: 'Serverless should be default choice', argumentType: 'thesis', confidence: 0.75, nextArgumentNeeded: true }}, { testId: 'debate-edge-001', args: { claim: 'Containers provide best balance', premises: ['Full control', 'Portability'], conclusion: 'Containers offer more flexibility', argumentType: 'antithesis', confidence: 0.7, respondsTo: 'arg-001', nextArgumentNeeded: true }}, { testId: 'debate-edge-002', args: { claim: 'Hybrid approach optimizes both', premises: ['Combine strengths', 'Use case dependent'], conclusion: 'Use both strategically', argumentType: 'synthesis', confidence: 0.85, nextArgumentNeeded: false }}, ], map: [ { testId: 'map-basic-001', args: { operation: 'create', diagramId: 'bench-map-001', diagramType: 'flowchart', elements: [ { id: 'start', type: 'node', properties: { label: 'Start' } }, { id: 'process', type: 'node', properties: { label: 'Process' } }, { id: 'end', type: 'node', properties: { label: 'End' } }, { id: 'e1', type: 'edge', source: 'start', target: 'process', properties: {} }, { id: 'e2', type: 'edge', source: 'process', target: 'end', properties: {} }, ], iteration: 0, nextOperationNeeded: true }}, { testId: 'map-edge-001', args: { operation: 'observe', diagramId: 'bench-map-001', diagramType: 'flowchart', insight: 'Gateway pattern enables scaling', iteration: 1, nextOperationNeeded: false }}, ], }; // Results storage const results = { tools: {}, resources: null, prompts: null, validation: { passed: 0, failed: 0, errors: [] }, performance: { improved: 0, regressed: 0, neutral: 0 }, }; async function sendRequest(method, params = {}) { const start = performance.now(); try { const response = await fetch(BASE_URL, { method: 'POST', headers: { 'Content-Type': 'application/json', 'Accept': 'application/json, text/event-stream', }, body: JSON.stringify({ jsonrpc: '2.0', id: Date.now(), method, params, }), }); const text = await response.text(); const durationMs = performance.now() - start; try { const json = JSON.parse(text); return { ...json, durationMs, raw: text }; } catch { // Handle SSE response const lines = text.trim().split('\n'); for (let i = lines.length - 1; i >= 0; i--) { if (lines[i].startsWith('data: ')) { return { ...JSON.parse(lines[i].slice(6)), durationMs, raw: text }; } } return { error: { message: 'Failed to parse response' }, durationMs }; } } catch (error) { return { error: { message: error.message }, durationMs: performance.now() - start }; } } /** * Validate standardized response format (PR 12) */ function validateResponseFormat(response, toolName) { const issues = []; if (!response.result) { issues.push('Missing result object'); return { valid: false, issues }; } const { result } = response; // Check for standardized response wrapper if (typeof result.success !== 'boolean') { issues.push('Missing or invalid "success" field'); } if (result.tool !== toolName) { issues.push(`Tool mismatch: expected "${toolName}", got "${result.tool}"`); } if (!result.data) { issues.push('Missing "data" field'); } if (!result.metadata) { issues.push('Missing "metadata" field'); } else { if (typeof result.metadata.processingTimeMs !== 'number') { issues.push('Missing metadata.processingTimeMs'); } if (result.metadata.version !== '2.0.0') { issues.push(`Version mismatch: expected "2.0.0", got "${result.metadata.version}"`); } if (!result.metadata.timestamp) { issues.push('Missing metadata.timestamp'); } if (!result.metadata.requestId) { issues.push('Missing metadata.requestId'); } } return { valid: issues.length === 0, issues }; } async function benchmarkTool(toolName, scenarios) { const toolResults = { name: toolName, tests: [], avgMs: 0, minMs: Infinity, maxMs: 0, validationPassed: 0, validationFailed: 0, }; for (const scenario of scenarios) { const response = await sendRequest('tools/call', { name: toolName, arguments: scenario.args, }); const validation = validateResponseFormat(response, toolName); if (validation.valid) { toolResults.validationPassed++; results.validation.passed++; } else { toolResults.validationFailed++; results.validation.failed++; results.validation.errors.push({ testId: scenario.testId, issues: validation.issues }); } const testResult = { testId: scenario.testId, durationMs: Math.round(response.durationMs * 100) / 100, success: response.result?.success ?? false, validationPassed: validation.valid, issues: validation.issues, }; toolResults.tests.push(testResult); toolResults.avgMs += response.durationMs; toolResults.minMs = Math.min(toolResults.minMs, response.durationMs); toolResults.maxMs = Math.max(toolResults.maxMs, response.durationMs); } toolResults.avgMs = Math.round((toolResults.avgMs / scenarios.length) * 100) / 100; toolResults.minMs = Math.round(toolResults.minMs * 100) / 100; toolResults.maxMs = Math.round(toolResults.maxMs * 100) / 100; return toolResults; } async function benchmarkResources() { const start = performance.now(); const response = await sendRequest('resources/list'); const listDuration = performance.now() - start; if (response.error) { return { error: response.error.message }; } const resourceCount = response.result?.resources?.length || 0; // Test reading a resource const readStart = performance.now(); const readResponse = await sendRequest('resources/read', { uri: 'think://models' }); const readDuration = performance.now() - readStart; return { count: resourceCount, listDurationMs: Math.round(listDuration * 100) / 100, readDurationMs: Math.round(readDuration * 100) / 100, readSuccess: !readResponse.error, }; } async function benchmarkPrompts() { const start = performance.now(); const response = await sendRequest('prompts/list'); const listDuration = performance.now() - start; if (response.error) { return { error: response.error.message }; } const promptCount = response.result?.prompts?.length || 0; // Test getting a prompt const getStart = performance.now(); const getResponse = await sendRequest('prompts/get', { name: 'analyze-problem', arguments: { problem: 'Benchmark test', context: 'Performance testing' } }); const getDuration = performance.now() - getStart; return { count: promptCount, listDurationMs: Math.round(listDuration * 100) / 100, getDurationMs: Math.round(getDuration * 100) / 100, getSuccess: !getResponse.error && getResponse.result?.messages?.length > 0, }; } function compareWithBaseline(toolName, currentAvg) { const baseline = BASELINE[toolName]; if (!baseline) return { status: 'no-baseline', delta: 0 }; const delta = currentAvg - baseline.avgMs; const percentChange = ((delta / baseline.avgMs) * 100).toFixed(1); if (delta < -5) { results.performance.improved++; return { status: 'improved', delta, percentChange: `-${Math.abs(percentChange)}%`, baseline: baseline.avgMs }; } else if (delta > 10) { results.performance.regressed++; return { status: 'regressed', delta, percentChange: `+${percentChange}%`, baseline: baseline.avgMs }; } else { results.performance.neutral++; return { status: 'neutral', delta, percentChange: `${delta >= 0 ? '+' : ''}${percentChange}%`, baseline: baseline.avgMs }; } } async function main() { console.log('═══════════════════════════════════════════════════════════════════════'); console.log(' think-mcp Performance Benchmark'); console.log('═══════════════════════════════════════════════════════════════════════'); console.log(` Server: ${BASE_URL}`); console.log(` Date: ${new Date().toISOString()}`); console.log('═══════════════════════════════════════════════════════════════════════\n'); try { // Benchmark all tools console.log('🔧 TOOL PERFORMANCE BENCHMARKS\n'); console.log('┌────────────┬──────────┬──────────┬──────────┬────────────┬─────────────────┐'); console.log('│ Tool │ Avg (ms) │ Min (ms) │ Max (ms) │ Baseline │ Change │'); console.log('├────────────┼──────────┼──────────┼──────────┼────────────┼─────────────────┤'); for (const [toolName, scenarios] of Object.entries(TEST_SCENARIOS)) { const toolResults = await benchmarkTool(toolName, scenarios); results.tools[toolName] = toolResults; const comparison = compareWithBaseline(toolName, toolResults.avgMs); const statusIcon = comparison.status === 'improved' ? '✅' : comparison.status === 'regressed' ? '❌' : '➖'; console.log(`│ ${toolName.padEnd(10)} │ ${toolResults.avgMs.toString().padStart(8)} │ ${toolResults.minMs.toString().padStart(8)} │ ${toolResults.maxMs.toString().padStart(8)} │ ${(comparison.baseline || 'N/A').toString().padStart(10)} │ ${statusIcon} ${(comparison.percentChange || 'N/A').padStart(12)} │`); } console.log('└────────────┴──────────┴──────────┴──────────┴────────────┴─────────────────┘\n'); // Benchmark resources console.log('📚 RESOURCES BENCHMARK\n'); results.resources = await benchmarkResources(); if (results.resources.error) { console.log(` ❌ Error: ${results.resources.error}`); } else { console.log(` Resources found: ${results.resources.count}`); console.log(` List duration: ${results.resources.listDurationMs}ms`); console.log(` Read duration: ${results.resources.readDurationMs}ms`); console.log(` Read success: ${results.resources.readSuccess ? '✅' : '❌'}`); } console.log(); // Benchmark prompts console.log('💬 PROMPTS BENCHMARK\n'); results.prompts = await benchmarkPrompts(); if (results.prompts.error) { console.log(` ❌ Error: ${results.prompts.error}`); } else { console.log(` Prompts found: ${results.prompts.count}`); console.log(` List duration: ${results.prompts.listDurationMs}ms`); console.log(` Get duration: ${results.prompts.getDurationMs}ms`); console.log(` Get success: ${results.prompts.getSuccess ? '✅' : '❌'}`); } console.log(); // Validation summary console.log('✅ STANDARDIZED RESPONSE VALIDATION (PR 12)\n'); console.log(` Passed: ${results.validation.passed}`); console.log(` Failed: ${results.validation.failed}`); if (results.validation.errors.length > 0) { console.log(' Errors:'); results.validation.errors.forEach(e => { console.log(` - ${e.testId}: ${e.issues.join(', ')}`); }); } console.log(); // Performance summary console.log('═══════════════════════════════════════════════════════════════════════'); console.log(' PERFORMANCE COMPARISON SUMMARY'); console.log('═══════════════════════════════════════════════════════════════════════'); console.log(` ✅ Improved: ${results.performance.improved} tools`); console.log(` ➖ Neutral: ${results.performance.neutral} tools`); console.log(` ❌ Regressed: ${results.performance.regressed} tools`); console.log(); // Overall assessment const totalTests = Object.values(results.tools).reduce((sum, t) => sum + t.tests.length, 0); const avgOverallMs = Object.values(results.tools).reduce((sum, t) => sum + t.avgMs, 0) / Object.keys(results.tools).length; const baselineAvg = Object.values(BASELINE).reduce((sum, b) => sum + b.avgMs, 0) / Object.keys(BASELINE).length; const overallChange = ((avgOverallMs - baselineAvg) / baselineAvg * 100).toFixed(1); console.log('═══════════════════════════════════════════════════════════════════════'); console.log(' OVERALL ASSESSMENT'); console.log('═══════════════════════════════════════════════════════════════════════'); console.log(` Total tests run: ${totalTests}`); console.log(` Average response time: ${avgOverallMs.toFixed(2)}ms`); console.log(` Baseline average: ${baselineAvg.toFixed(2)}ms`); console.log(` Overall change: ${overallChange >= 0 ? '+' : ''}${overallChange}%`); console.log(); if (results.validation.failed === 0 && results.performance.regressed === 0) { console.log('🎉 All tests passed! No regressions detected.\n'); } else if (results.performance.regressed > 0) { console.log('⚠️ Performance regressions detected. Review above for details.\n'); } // Save results to file const outputPath = `../test-results/benchmark-${new Date().toISOString().split('T')[0]}.json`; const fs = await import('fs'); fs.writeFileSync( new URL(outputPath, import.meta.url), JSON.stringify(results, null, 2) ); console.log(`📊 Results saved to: test-results/benchmark-${new Date().toISOString().split('T')[0]}.json`); } catch (error) { console.error('\n❌ Benchmark failed:', error.message); process.exit(1); } } main();

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/letsgomaslow/think'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

benchmark-mcp.mjs•21.3 KiB