test-nlp-full-execution.js
const { spawn } = require('child_process');
const fs = require('fs');
const path = require('path');

/**
 * Full Execution NLP Test Framework
 *
 * This framework tests the complete pipeline:
 * 1. Natural language input
 * 2. LLM tool selection
 * 3. MCP tool execution
 * 4. REAPER command execution
 * 5. Response quality evaluation
 */
class FullExecutionNLPTester {
  constructor() {
    this.mcpProcess = null;
    this.results = [];
    this.mcpServerPath = process.env.VITE_MCP_SERVER_PATH || '/Users/stevehiehn/total-reaper-mcp';
    this.apiKey = process.env.OPENAI_API_KEY;
  }

  /**
   * Start MCP server
   */
  async startMCPServer() {
    console.log('Starting MCP server...');

    return new Promise((resolve, reject) => {
      this.mcpProcess = spawn('python', [
        '-m', 'server.app',
        '--transport', 'stdio'
      ], {
        cwd: this.mcpServerPath,
        env: { ...process.env, PYTHONPATH: this.mcpServerPath }
      });

      this.mcpProcess.stdout.on('data', (data) => {
        const output = data.toString();
        if (output.includes('Server started')) {
          console.log('MCP server started successfully');
          resolve();
        }
      });

      this.mcpProcess.stderr.on('data', (data) => {
        console.error('MCP stderr:', data.toString());
      });

      this.mcpProcess.on('error', (error) => {
        console.error('Failed to start MCP server:', error);
        reject(error);
      });

      // Give it time to start
      setTimeout(() => resolve(), 3000);
    });
  }

  /**
   * Call MCP tool directly
   */
  async callMCPTool(toolName, params = {}) {
    return new Promise((resolve, reject) => {
      const request = {
        jsonrpc: '2.0',
        method: 'tools/call',
        params: {
          name: toolName,
          arguments: params
        },
        id: Date.now()
      };

      this.mcpProcess.stdin.write(JSON.stringify(request) + '\n');

      const responseHandler = (data) => {
        const lines = data.toString().split('\n');
        for (const line of lines) {
          if (!line.trim()) continue;
          try {
            const response = JSON.parse(line);
            if (response.id === request.id) {
              this.mcpProcess.stdout.removeListener('data', responseHandler);
              if (response.error) {
                reject(new Error(response.error.message));
              } else {
                resolve(response.result);
              }
            }
          } catch (e) {
            // Not JSON, ignore
          }
        }
      };

      this.mcpProcess.stdout.on('data', responseHandler);

      // Timeout after 5 seconds
      setTimeout(() => {
        this.mcpProcess.stdout.removeListener('data', responseHandler);
        reject(new Error('MCP tool call timeout'));
      }, 5000);
    });
  }

  /**
   * Get LLM to select tool and parameters
   */
  async getLLMToolSelection(userInput, tools) {
    const systemPrompt = fs.readFileSync(
      path.join(__dirname, 'improved-system-prompt.md'),
      'utf8'
    ).split('```')[1];

    const response = await fetch('https://api.openai.com/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${this.apiKey}`
      },
      body: JSON.stringify({
        model: 'gpt-4-turbo-preview',
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userInput }
        ],
        tools: tools,
        tool_choice: 'auto',
        temperature: 0.1
      })
    });

    const data = await response.json();

    if (data.error) {
      throw new Error(data.error.message);
    }

    const message = data.choices[0].message;
    return {
      content: message.content,
      tool_calls: message.tool_calls || []
    };
  }

  /**
   * Evaluate response quality
   */
  async evaluateResponse(userInput, toolCalled, toolResult, llmResponse) {
    const evaluationPrompt = `
You are evaluating a DAW assistant's response quality.

User asked: "${userInput}"
Tool called: ${toolCalled}
Tool result: ${JSON.stringify(toolResult)}
Assistant's response: ${llmResponse}

Evaluate on these criteria (0-10 scale):
1. Answered Question: Did it directly answer what the user asked?
2. Correct Tool: Was the right tool selected for the task?
3. Complete Response: Was all necessary information provided?
4. User Friendly: Was the response clear and non-technical?

Respond with JSON only:
{
  "answered_question": 0-10,
  "correct_tool": 0-10,
  "complete_response": 0-10,
  "user_friendly": 0-10,
  "issues": ["list of specific issues"],
  "suggestions": ["list of improvements"]
}`;

    const response = await fetch('https://api.openai.com/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${this.apiKey}`
      },
      body: JSON.stringify({
        model: 'gpt-4-turbo-preview',
        messages: [
          { role: 'system', content: 'You are a quality evaluator. Respond with JSON only.' },
          { role: 'user', content: evaluationPrompt }
        ],
        temperature: 0.1
      })
    });

    const data = await response.json();

    try {
      return JSON.parse(data.choices[0].message.content);
    } catch (e) {
      console.error('Failed to parse evaluation:', data.choices[0].message.content);
      return null;
    }
  }

  /**
   * Run a single test
   */
  async runTest(testCase) {
    console.log(`\nTesting: "${testCase.input}"`);

    try {
      // Step 1: Get LLM tool selection
      const llmResponse = await this.getLLMToolSelection(
        testCase.input,
        this.getToolDefinitions()
      );

      let toolResult = null;
      let toolCalled = 'none';

      if (llmResponse.tool_calls && llmResponse.tool_calls.length > 0) {
        const toolCall = llmResponse.tool_calls[0];
        toolCalled = toolCall.function.name;
        const args = JSON.parse(toolCall.function.arguments);

        console.log(`  Tool selected: ${toolCalled}`);
        console.log(`  Parameters: ${JSON.stringify(args)}`);

        // Step 2: Execute MCP tool
        try {
          toolResult = await this.callMCPTool(toolCalled, args);
          console.log(`  Tool result: ${JSON.stringify(toolResult).substring(0, 100)}...`);
        } catch (error) {
          console.error(`  Tool execution failed: ${error.message}`);
          toolResult = { error: error.message };
        }
      }

      // Step 3: Evaluate response quality
      const evaluation = await this.evaluateResponse(
        testCase.input,
        toolCalled,
        toolResult,
        llmResponse.content || 'No response content'
      );

      // Calculate overall score
      const scores = evaluation ?
        [
          evaluation.answered_question,
          evaluation.correct_tool,
          evaluation.complete_response,
          evaluation.user_friendly
        ] : [0, 0, 0, 0];

      const overallScore = scores.reduce((a, b) => a + b, 0) / scores.length;

      const result = {
        testCase,
        toolCalled,
        toolResult,
        llmResponse: llmResponse.content,
        evaluation,
        overallScore,
        success: overallScore >= 7
      };

      this.results.push(result);

      console.log(`  Overall score: ${overallScore.toFixed(1)}/10`);
      if (evaluation && evaluation.issues.length > 0) {
        console.log(`  Issues: ${evaluation.issues.join(', ')}`);
      }

      return result;
    } catch (error) {
      console.error(`  Test failed: ${error.message}`);
      const result = {
        testCase,
        error: error.message,
        success: false,
        overallScore: 0
      };
      this.results.push(result);
      return result;
    }
  }

  /**
   * Run all tests
   */
  async runTests(testCases) {
    await this.startMCPServer();

    console.log('Running full execution tests...\n');

    for (const testCase of testCases) {
      await this.runTest(testCase);
      // Rate limiting
      await new Promise(resolve => setTimeout(resolve, 1000));
    }

    return this.generateReport();
  }

  /**
   * Generate detailed report
   */
  generateReport() {
    const totalTests = this.results.length;
    const successfulTests = this.results.filter(r => r.success).length;
    const averageScore = this.results.reduce((sum, r) => sum + r.overallScore, 0) / totalTests;

    console.log('\n=== Full Execution Test Report ===');
    console.log(`Total Tests: ${totalTests}`);
    console.log(`Successful: ${successfulTests} (${(successfulTests/totalTests*100).toFixed(1)}%)`);
    console.log(`Average Score: ${averageScore.toFixed(1)}/10`);

    // Group by issue type
    const issueFrequency = {};
    this.results.forEach(r => {
      if (r.evaluation && r.evaluation.issues) {
        r.evaluation.issues.forEach(issue => {
          issueFrequency[issue] = (issueFrequency[issue] || 0) + 1;
        });
      }
    });

    console.log('\n=== Common Issues ===');
    Object.entries(issueFrequency)
      .sort((a, b) => b[1] - a[1])
      .slice(0, 5)
      .forEach(([issue, count]) => {
        console.log(`- ${issue}: ${count} occurrences`);
      });

    // Low scoring tests
    console.log('\n=== Tests Needing Improvement ===');
    this.results
      .filter(r => r.overallScore < 7)
      .slice(0, 5)
      .forEach(r => {
        console.log(`\n"${r.testCase.input}" (Score: ${r.overallScore.toFixed(1)})`);
        if (r.evaluation) {
          console.log(`  Issues: ${r.evaluation.issues.join(', ')}`);
          console.log(`  Suggestions: ${r.evaluation.suggestions.join(', ')}`);
        }
      });

    // Save detailed results
    fs.writeFileSync(
      'nlp-full-execution-results.json',
      JSON.stringify(this.results, null, 2)
    );

    console.log('\nDetailed results saved to nlp-full-execution-results.json');

    return {
      summary: {
        totalTests,
        successfulTests,
        successRate: successfulTests / totalTests,
        averageScore
      },
      results: this.results
    };
  }

  /**
   * Get tool definitions
   */
  getToolDefinitions() {
    // Simplified for testing - in production, fetch from MCP
    return [
      {
        type: 'function',
        function: {
          name: 'dsl_list_tracks',
          description: 'Show what\'s in your project. Use when users ask what tracks exist or want an overview.',
          parameters: { type: 'object', properties: {} }
        }
      },
      {
        type: 'function',
        function: {
          name: 'dsl_track_create',
          description: 'Add a new instrument, voice, or sound to your project.',
          parameters: {
            type: 'object',
            properties: {
              name: { type: 'string', description: 'Track name' },
              role: { type: 'string', description: 'Track role (bass, drums, etc)' }
            }
          }
        }
      },
      {
        type: 'function',
        function: {
          name: 'dsl_track_volume',
          description: 'Make sounds louder or quieter.',
          parameters: {
            type: 'object',
            properties: {
              track: { type: ['string', 'integer'], description: 'Track reference' },
              volume: { type: ['number', 'string'], description: 'Volume value' }
            },
            required: ['track', 'volume']
          }
        }
      }
      // Add more tools as needed
    ];
  }

  /**
   * Cleanup
   */
  cleanup() {
    if (this.mcpProcess) {
      console.log('\nStopping MCP server...');
      this.mcpProcess.kill();
    }
  }
}

// Test cases focusing on response quality
const testCases = [
  { input: "what are the names of the tracks?", expectedInfo: "track names" },
  { input: "create a bass track", expectedInfo: "confirmation of track creation" },
  { input: "make track 1 louder", expectedInfo: "volume adjustment confirmation" },
  { input: "I need something for the drums", expectedInfo: "drum track creation" },
  { input: "show me what I have", expectedInfo: "project overview with track details" }
];

// Main execution
async function main() {
  const tester = new FullExecutionNLPTester();

  if (!tester.apiKey) {
    console.error('Error: OPENAI_API_KEY environment variable not set');
    process.exit(1);
  }

  try {
    await tester.runTests(testCases);
  } catch (error) {
    console.error('Test runner error:', error);
  } finally {
    tester.cleanup();
  }
}

// Handle cleanup on exit
process.on('SIGINT', () => {
  console.log('\nInterrupted, cleaning up...');
  process.exit(0);
});

if (require.main === module) {
  main();
}

module.exports = { FullExecutionNLPTester };
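
The comment in getToolDefinitions() notes that production code should fetch tool definitions from the MCP server rather than hardcode them. A minimal sketch of what that could look like over the same JSON-RPC channel: the tools/list method and inputSchema field come from the MCP specification, while the sendRequest(method, params) helper (factored out of callMCPTool) and the mapping to OpenAI's tool format are illustrative assumptions, not part of this file.

// Hypothetical sketch: fetch tool definitions via MCP's tools/list method
// and map each one to OpenAI's function-calling tool format.
// Assumes a sendRequest(method, params) helper that returns the JSON-RPC result.
async function getToolDefinitionsFromMCP(sendRequest) {
  const result = await sendRequest('tools/list', {});
  return (result.tools || []).map(tool => ({
    type: 'function',
    function: {
      name: tool.name,
      description: tool.description,
      // MCP exposes a JSON Schema as inputSchema; OpenAI expects it as parameters
      parameters: tool.inputSchema || { type: 'object', properties: {} }
    }
  }));
}

To run the tests themselves, the script expects OPENAI_API_KEY to be set and VITE_MCP_SERVER_PATH to point at a total-reaper-mcp checkout, e.g. OPENAI_API_KEY=<your key> node test-nlp-full-execution.js.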

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/shiehn/total-reaper-mcp'
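
The same lookup from Node.js 18+, as a minimal sketch: it assumes only that the endpoint returns JSON, as the curl example implies, and does not document specific response fields.

// Minimal sketch (top-level await, so run as an ES module):
// query the Glama MCP directory API for this server's entry.
const res = await fetch('https://glama.ai/api/mcp/v1/servers/shiehn/total-reaper-mcp');
if (!res.ok) throw new Error(`Directory API request failed: ${res.status}`);
console.log(await res.json());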

If you have feedback or need assistance with the MCP directory API, please join our Discord server.