Smart-AI-Bridge

generate-file-handler.js•26.4 KiB

/**
 * GenerateFileHandler - Local LLM Code Generation
 *
 * Purpose: Local LLM generates code from spec. Claude reviews or auto-approves.
 * Token savings: 500+ → ~50 tokens (spec only)
 *
 * Flow:
 * 1. Claude sends: generate_file("JWT auth middleware with rate limiting", "src/middleware/auth.ts")
 * 2. Local LLM generates production code
 * 3. If review=true → returns content for Claude approval
 * 4. If review=false → writes directly, returns summary
 */

import { BaseHandler } from './base-handler.js';
import { promises as fs } from 'fs';
import path from 'path';
import { getLocalContextLimit } from '../utils/model-discovery.js';

// Backend selection based on code complexity/type
const GENERATION_BACKEND_MAP = {
  simple: 'local',             // Quick scaffolding
  standard: 'nvidia_qwen',     // Production code
  complex: 'nvidia_deepseek',  // Complex logic
  test: 'nvidia_qwen'          // Test generation
};

// Retry configuration for resilient generation
const RETRY_CONFIG = {
  maxLocalRetries: 2,          // Try local with increasing tokens
  maxDualModeIterations: 3,    // Coding + reasoning model iterations
  tokenScaleFactor: 1.5,       // Multiply tokens on retry
  cloudFallbackEnabled: true,  // Enable cloud fallback by default
  truncationThreshold: 0.7     // If output < 70% expected, likely truncated
};

// Backend used when falling back to the cloud after truncated output
const CLOUD_FALLBACK_BACKEND = 'nvidia_qwen';

// Hard upper bound on response tokens requested from any backend
const MAX_RESPONSE_TOKENS = 16000;

/**
 * Escape a value so it can be embedded literally inside a RegExp source.
 * Needed because language names such as "c++" contain regex metacharacters.
 * @param {*} text - Value to escape (null/undefined become an empty string)
 * @returns {string} Regex-safe literal
 */
function escapeRegExp(text) {
  return String(text ?? '').replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

export class GenerateFileHandler extends BaseHandler {
  /**
   * Execute file generation using local LLM
   * @param {Object} args - Generation arguments
   * @param {string} args.spec - Natural language specification
   * @param {string} args.outputPath - Where to write the file
   * @param {Object} [args.options] - Optional configuration
   * @param {string} [args.options.backend] - Force specific backend
   * @param {string} [args.options.modelProfile] - Local router model profile (coding-qwen-7b|coding-seed-coder|fast-qwen14b|etc)
   * @param {boolean} [args.options.review] - Return for approval (default: true)
   * @param {string[]} [args.options.contextFiles] - Related files for style/patterns
   * @param {string} [args.options.language] - Language (auto-detect if not specified)
   * @param {boolean} [args.options.includeTests] - Also generate tests
   * @returns {Promise<Object>} Generation result
   * @throws {Error} If spec or outputPath is missing, if the prompt exceeds the
   *   selected backend's context limit, or if a request/file operation fails
   */
  async execute(args) {
    const { spec, outputPath, options = {} } = args;

    if (!spec) {
      throw new Error('spec is required');
    }
    if (!outputPath) {
      throw new Error('outputPath is required');
    }

    const {
      backend = 'auto',
      modelProfile = null, // For local router model selection
      review = true,
      contextFiles = [],
      language = null,
      includeTests = false
    } = options;

    const startTime = Date.now();

    try {
      // 1. Resolve output path
      const absolutePath = path.isAbsolute(outputPath) ? outputPath : path.resolve(outputPath);

      // 2. Detect language from path or explicit setting
      const detectedLanguage = language || this.detectLanguage(outputPath);

      // 3. Check if file already exists
      let existingContent = null;
      try {
        existingContent = await fs.readFile(absolutePath, 'utf8');
        console.error(`[GenerateFile] ⚠️ File exists, will overwrite: ${outputPath}`);
      } catch {
        // File doesn't exist (or is unreadable); generation proceeds without it
      }

      // 4. Gather context from related files
      const contextContents = await this.gatherContextFiles(contextFiles);

      // 5. Estimate complexity for backend selection
      const complexity = this.estimateComplexity(spec, includeTests);

      // 6. Build the generation prompt
      const prompt = this.buildGenerationPrompt(spec, {
        outputPath,
        language: detectedLanguage,
        contextContents,
        includeTests,
        existingContent
      });

      // 7. Determine the backend
      let selectedBackend = this.selectBackend(backend, complexity);

      // INPUT size limit check (local llama.cpp server configured limit)
      // Get dynamic context limit from loaded model
      const { charLimit: MAX_LOCAL_INPUT_CHARS, model: loadedModel } = await getLocalContextLimit();
      console.error(`[${this.constructor.name}] 📊 Dynamic limit: ${MAX_LOCAL_INPUT_CHARS} chars (model: ${loadedModel})`);

      if (prompt.length > MAX_LOCAL_INPUT_CHARS && selectedBackend.startsWith('local')) {
        console.error(`[GenerateFile] ⚠️ Prompt size (${prompt.length} chars) exceeds local server limit (${MAX_LOCAL_INPUT_CHARS} chars)`);
        console.error(`[GenerateFile] 🔄 Auto-fallback to nvidia_qwen (128K context)`);
        selectedBackend = 'nvidia_qwen'; // Fast cloud alternative with 128K context
      }

      console.error(`[GenerateFile] 📝 Generating ${outputPath}`);
      console.error(`[GenerateFile] 🎯 Backend: ${selectedBackend}, Language: ${detectedLanguage}${modelProfile ? `, Model: ${modelProfile}` : ''}`);
      console.error(`[GenerateFile] 📋 Spec: ${spec.substring(0, 100)}...`);

      // 8. Calculate dynamic token allocation and timeout
      const promptLength = prompt.length;
      const maxTokens = this.calculateDynamicTokens(selectedBackend, promptLength, complexity);
      const timeoutMs = this.calculateDynamicTimeout(selectedBackend, maxTokens);

      // Check if context limit exceeded
      const contextLimit = this.getBackendContextLimit(selectedBackend);
      if (promptLength > contextLimit * 0.9) { // 90% threshold
        throw new Error(
          `Generation prompt (${promptLength} chars) exceeds ${selectedBackend} context limit (${contextLimit} chars). ` +
          `Consider reducing spec, context files, or using a backend with larger context.`
        );
      }

      console.error(`[GenerateFile] 📊 Allocation: ${maxTokens} tokens, ${(timeoutMs / 1000).toFixed(1)}s timeout`);

      // 9. Execute with retry logic for truncation handling
      // Checks BOTH finish_reason AND code structure inside the loop
      let response;
      let generated;
      let currentTokens = maxTokens;
      let attempts = 0;
      let usedBackend = selectedBackend;
      let wasTruncated = false;
      // May grow by one so that a cloud fallback chosen on the last regular
      // attempt is actually executed instead of being dropped by the loop guard.
      let maxAttempts = RETRY_CONFIG.maxLocalRetries + 1;

      while (attempts < maxAttempts) {
        attempts++;

        response = await this.makeRequest(prompt, usedBackend, {
          maxTokens: currentTokens,
          routerModel: modelProfile,
          // Recomputed per attempt: the backend and token budget change on retry
          timeout: this.calculateDynamicTimeout(usedBackend, currentTokens)
        });

        // Parse the response immediately so we can check structure
        generated = this.parseGeneratedCode(response.content || response, detectedLanguage);

        // Check for truncation via BOTH finish_reason AND code structure
        const finishReason = response.metadata?.finishReason || response.finish_reason;
        const finishReasonTruncated = finishReason === 'length';
        const structureTruncated = this.detectCodeTruncation(generated.code, spec);
        wasTruncated = finishReasonTruncated || structureTruncated;

        if (!wasTruncated) {
          break; // Success
        }

        const truncationSource = finishReasonTruncated ? 'finish_reason: length' : 'code structure incomplete';
        console.error(`[GenerateFile] ⚠️ Output truncated (${truncationSource}), attempt ${attempts}/${maxAttempts}`);

        // Try dual-mode iteration if available (local coding + reasoning)
        if (usedBackend === 'local' && attempts <= RETRY_CONFIG.maxLocalRetries) {
          const dualResult = await this.tryDualModeGeneration(prompt, currentTokens, modelProfile);
          if (dualResult.success && !dualResult.truncated) {
            response = dualResult.response;
            generated = this.parseGeneratedCode(response.content || response, detectedLanguage);
            // Re-check structure after dual mode
            wasTruncated = this.detectCodeTruncation(generated.code, spec);
            if (!wasTruncated) {
              console.error(`[GenerateFile] ✅ Dual-mode iteration succeeded`);
              break;
            }
          }

          // Scale up tokens for next attempt
          currentTokens = Math.min(Math.floor(currentTokens * RETRY_CONFIG.tokenScaleFactor), MAX_RESPONSE_TOKENS);
          console.error(`[GenerateFile] 🔄 Scaling tokens to ${currentTokens} for retry`);
          continue;
        }

        // Cloud fallback - try cloud if local exhausted OR if already on cloud but still truncated
        if (RETRY_CONFIG.cloudFallbackEnabled) {
          if (usedBackend !== CLOUD_FALLBACK_BACKEND) {
            const cloudLimit = this.getBackendContextLimit(CLOUD_FALLBACK_BACKEND);
            if (promptLength > cloudLimit * 0.9) {
              // The prompt would not fit; keep the truncated result rather than fail outright
              console.error(`[GenerateFile] ⚠️ Skipping cloud fallback: prompt (${promptLength} chars) exceeds ${CLOUD_FALLBACK_BACKEND} context limit (${cloudLimit} chars)`);
              break;
            }
            console.error(`[GenerateFile] 🌐 Falling back to cloud (nvidia_qwen)`);
            usedBackend = CLOUD_FALLBACK_BACKEND;
            currentTokens = Math.min(currentTokens * 2, MAX_RESPONSE_TOKENS);
            // Guarantee the cloud request runs even when regular attempts are used up
            maxAttempts = Math.max(maxAttempts, attempts + 1);
            continue;
          } else if (attempts <= RETRY_CONFIG.maxLocalRetries) {
            // Already on cloud, scale up tokens and retry
            currentTokens = Math.min(Math.floor(currentTokens * RETRY_CONFIG.tokenScaleFactor), MAX_RESPONSE_TOKENS);
            console.error(`[GenerateFile] 🔄 Cloud retry with ${currentTokens} tokens`);
            continue;
          }
        }

        break; // Exhausted retries
      }

      const processingTime = Date.now() - startTime;

      // 10. Handle review mode (default)
      if (review) {
        this.recordExecution(
          {
            success: true,
            backend: usedBackend,
            processingTime,
            mode: 'review',
            retryAttempts: attempts,
            wasTruncated
          },
          {
            tool: 'generate_file',
            taskType: 'generation',
            outputPath
          }
        );

        return this.buildSuccessResponse({
          status: wasTruncated ? 'pending_review_truncated' : 'pending_review',
          outputPath: absolutePath,
          content: generated.code,
          summary: generated.summary,
          linesOfCode: generated.code.split('\n').length,
          language: detectedLanguage,
          tests: generated.tests,
          backend_used: usedBackend,
          processing_time: processingTime,
          retry_attempts: attempts,
          was_truncated: wasTruncated,
          instructions: wasTruncated
            ? 'WARNING: Output may be truncated. Review carefully and consider regenerating with simpler spec.'
            : 'Review the generated code. Use write_files_atomic to save or modify as needed.'
        });
      }

      // 11. Auto-write mode (review=false)
      // Ensure directory exists
      await fs.mkdir(path.dirname(absolutePath), { recursive: true });
      await fs.writeFile(absolutePath, generated.code, 'utf8');

      // Write tests if generated
      const testPath = includeTests && generated.tests ? this.getTestPath(absolutePath) : null;
      if (testPath) {
        await fs.mkdir(path.dirname(testPath), { recursive: true });
        await fs.writeFile(testPath, generated.tests, 'utf8');
      }

      this.recordExecution(
        {
          success: true,
          backend: usedBackend,
          processingTime,
          mode: 'write',
          retryAttempts: attempts,
          wasTruncated
        },
        {
          tool: 'generate_file',
          taskType: 'generation',
          outputPath
        }
      );

      return this.buildSuccessResponse({
        status: wasTruncated ? 'written_truncated' : 'written',
        outputPath: absolutePath,
        summary: generated.summary,
        linesOfCode: generated.code.split('\n').length,
        language: detectedLanguage,
        testPath,
        backend_used: usedBackend,
        processing_time: processingTime,
        retry_attempts: attempts,
        was_truncated: wasTruncated
      });
    } catch (error) {
      console.error(`[GenerateFile] ❌ Error: ${error.message}`);
      throw error;
    }
  }

  /**
   * Gather content from context files for style matching.
   * At most 3 files are read and each is cut to 8000 characters; unreadable
   * files are logged and skipped.
   * @param {string[]} contextPaths - Paths of reference files (absolute or relative to cwd)
   * @returns {Promise<Array<{path: string, content: string}>>} Readable context files
   */
  async gatherContextFiles(contextPaths) {
    if (!contextPaths || contextPaths.length === 0) {
      return [];
    }

    const contextContents = [];

    for (const contextPath of contextPaths.slice(0, 3)) { // Limit to 3 context files
      try {
        const absPath = path.isAbsolute(contextPath) ? contextPath : path.resolve(contextPath);
        const content = await fs.readFile(absPath, 'utf8');
        contextContents.push({
          path: contextPath,
          content: content.substring(0, 8000) // Limit per-file context
        });
      } catch (error) {
        console.error(`[GenerateFile] Warning: Could not read context file ${contextPath}: ${error.message}`);
      }
    }

    return contextContents;
  }

  /**
   * Build the generation prompt for the LLM
   * @param {string} spec - Natural language specification
   * @param {Object} options - Prompt inputs
   * @param {string} options.outputPath - Target file path shown to the model
   * @param {string} options.language - Target language (also used as the code fence tag)
   * @param {Array<{path: string, content: string}>} [options.contextContents] - Style reference files
   * @param {boolean} [options.includeTests] - Ask for a TESTS section
   * @param {string|null} [options.existingContent] - Current file content (first 5000 chars are included)
   * @returns {string} Prompt text
   */
  buildGenerationPrompt(spec, options) {
    const { outputPath, language, contextContents, includeTests, existingContent } = options;

    let prompt = `You are a senior software engineer. Generate production-ready code based on this specification.

SPECIFICATION:
${spec}

OUTPUT FILE: ${outputPath}
LANGUAGE: ${language}
${includeTests ? 'GENERATE TESTS: Yes' : ''}
`;

    if (existingContent) {
      prompt += `
--- EXISTING FILE (will be replaced) ---
${existingContent.substring(0, 5000)}
--- END EXISTING ---
`;
    }

    if (contextContents && contextContents.length > 0) {
      prompt += '\n--- STYLE REFERENCE FILES ---\n';
      prompt += 'Match the coding style, patterns, and conventions from these files:\n';
      for (const ctx of contextContents) {
        prompt += `\n=== ${ctx.path} ===\n${ctx.content}\n`;
      }
      prompt += '--- END REFERENCE ---\n';
    }

    // The response layout below is what parseGeneratedCode expects
    prompt += `
REQUIREMENTS:
1. Generate complete, working, production-ready code
2. Include necessary imports and dependencies
3. Add appropriate error handling
4. Include JSDoc/docstrings for public APIs
5. Follow best practices for ${language}
6. Code should be ready to use without modification

Respond with the following structure:

SUMMARY: [1-2 sentence description of what was generated]

CODE:
\`\`\`${language}
[Your generated code here]
\`\`\`
`;

    if (includeTests) {
      prompt += `
TESTS:
\`\`\`${language}
[Your generated test code here]
\`\`\`
`;
    }

    return prompt;
  }

  /**
   * Estimate complexity for backend selection using keyword matching on the spec.
   * Requesting tests always counts as complex.
   * @param {string} spec - Natural language specification
   * @param {boolean} includeTests - Whether tests were requested
   * @returns {'simple'|'standard'|'complex'} Complexity bucket
   */
  estimateComplexity(spec, includeTests) {
    const lowerSpec = spec.toLowerCase();

    // Check for complex patterns
    const complexPatterns = [
      'algorithm', 'oauth', 'authentication', 'encryption',
      'database', 'migration', 'websocket', 'real-time',
      'concurrent', 'async', 'streaming', 'cache'
    ];

    const simplePatterns = [
      'utility', 'helper', 'constant', 'config',
      'type', 'interface', 'enum', 'model'
    ];

    const hasComplex = complexPatterns.some((p) => lowerSpec.includes(p));
    const hasSimple = simplePatterns.some((p) => lowerSpec.includes(p));

    if (hasComplex || includeTests) return 'complex';
    if (hasSimple) return 'simple';
    return 'standard';
  }

  /**
   * Select the appropriate backend.
   * An explicit request wins (short aliases are mapped, unknown names pass
   * through unchanged); otherwise the complexity bucket decides.
   * @param {string} requestedBackend - Requested backend or 'auto'
   * @param {string} complexity - Complexity bucket from estimateComplexity
   * @returns {string} Backend identifier
   */
  selectBackend(requestedBackend, complexity) {
    if (requestedBackend && requestedBackend !== 'auto') {
      const backendMap = {
        local: 'local',
        deepseek: 'nvidia_deepseek',
        qwen3: 'nvidia_qwen',
        gemini: 'gemini',
        groq: 'groq_llama'
      };
      return backendMap[requestedBackend] || requestedBackend;
    }

    return GENERATION_BACKEND_MAP[complexity] || 'local';
  }

  /**
   * Parse the LLM response to extract code and tests.
   * Falls back from the labelled CODE block, to the first fenced block, to the
   * whole response minus its SUMMARY line.
   * @param {string} responseText - Raw model output (non-strings are treated as empty)
   * @param {string} language - Expected code fence tag
   * @returns {{summary: string, code: string, tests: string|null}} Parsed sections
   */
  parseGeneratedCode(responseText, language) {
    // A non-string response would crash on .match; treat it as empty output so
    // the caller's truncation check triggers a retry instead.
    const text = typeof responseText === 'string' ? responseText : '';
    // Escape the tag: names like "c++" are otherwise invalid regex source
    const fenceTag = escapeRegExp(language);

    let summary = 'Code generated successfully';
    let code = '';
    let tests = null;

    // Extract summary
    const summaryMatch = text.match(/SUMMARY:\s*(.+?)(?=\n\nCODE:|CODE:|\n```)/is);
    if (summaryMatch) {
      summary = summaryMatch[1].trim();
    }

    // Extract main code block
    const codePattern = new RegExp(`CODE:\\s*\`\`\`(?:${fenceTag})?\\n([\\s\\S]*?)\`\`\``, 'i');
    const codeMatch = text.match(codePattern);
    if (codeMatch) {
      code = codeMatch[1].trim();
    } else {
      // Try to find any code block
      const anyCodeMatch = text.match(/```(?:\w+)?\n([\s\S]*?)```/);
      if (anyCodeMatch) {
        code = anyCodeMatch[1].trim();
      } else {
        // Last resort: treat entire response as code (excluding SUMMARY line)
        code = text.replace(/SUMMARY:.*\n?/i, '').trim();
      }
    }

    // Extract test code block
    const testPattern = new RegExp(`TESTS:\\s*\`\`\`(?:${fenceTag})?\\n([\\s\\S]*?)\`\`\``, 'i');
    const testMatch = text.match(testPattern);
    if (testMatch) {
      tests = testMatch[1].trim();
    }

    return { summary, code, tests };
  }

  /**
   * Get the test file path based on source file:
   * `<dir>/__tests__/<name>.test<ext>`
   * @param {string} sourcePath - Path of the generated source file
   * @returns {string} Path for the companion test file
   */
  getTestPath(sourcePath) {
    const dir = path.dirname(sourcePath);
    const ext = path.extname(sourcePath);
    const base = path.basename(sourcePath, ext);

    // Common test file naming conventions
    return path.join(dir, '__tests__', `${base}.test${ext}`);
  }

  /**
   * Get context limit for a backend (in characters, ~4 chars per token)
   * @param {string} backendName - Backend identifier
   * @returns {number} Context limit in characters (128000 for unknown backends)
   */
  getBackendContextLimit(backendName) {
    // Context limits in tokens, converted to chars (~4 chars/token)
    const contextLimits = {
      'local': 512000,            // 128K tokens * 4 = 512K chars (YARN extended)
      'nvidia_deepseek': 128000,  // 32K tokens * 4 = 128K chars
      'nvidia_qwen': 128000,      // 32K tokens * 4 = 128K chars
      'gemini': 128000,           // 32K tokens * 4 = 128K chars
      'groq_llama': 128000,       // 32K tokens * 4 = 128K chars
      'chatgpt': 512000           // 128K tokens * 4 = 512K chars
    };

    return contextLimits[backendName] || 128000; // Default 32K tokens
  }

  /**
   * Estimate tokens per second for a backend
   * @param {string} backendName - Backend identifier (local, nvidia_qwen, etc.)
   * @returns {number} Estimated tokens/second (20 for unknown backends)
   */
  estimateBackendSpeed(backendName) {
    // Backend speed estimates (tokens/sec)
    const backendSpeeds = {
      'local': 20,            // Conservative estimate for local models
      'nvidia_deepseek': 40,  // Cloud DeepSeek V3
      'nvidia_qwen': 35,      // Cloud Qwen3 480B
      'gemini': 50,           // Gemini Flash
      'groq_llama': 80,       // Ultra-fast Groq
      'chatgpt': 40           // OpenAI GPT-4
    };

    return backendSpeeds[backendName] || 20; // Default 20 tokens/sec
  }

  /**
   * Calculate dynamic token allocation based on backend speed, generation complexity,
   * and REMAINING context window (to prevent overflow)
   * @param {string} backendName - Backend identifier
   * @param {number} promptSize - Prompt size in characters
   * @param {string} complexity - Generation complexity (simple|standard|complex)
   * @returns {number} Allocated tokens for response, clamped to [1500, 16000]
   */
  calculateDynamicTokens(backendName, promptSize, complexity) {
    // INCREASED base tokens for generation (council recommended)
    const baseTokens = {
      simple: 1500,    // Was 800 - Simple functions, utilities
      standard: 3000,  // Was 1500 - Standard components, classes
      complex: 5000    // Was 2500 - Complex features, multiple files
    };

    // Get base allocation for this complexity level
    let allocation = baseTokens[complexity] || baseTokens.standard;

    // Scale up for larger prompts (more context = potentially more code)
    const promptTokens = Math.ceil(promptSize / 4);
    if (promptTokens > 2000) {
      allocation = Math.min(allocation * 1.5, 8000); // Cap at 8K for very large specs (was 4K)
    }

    // Backend-specific adjustments
    const tokensPerSecond = this.estimateBackendSpeed(backendName);
    if (tokensPerSecond < 25) {
      // Slower backends (local) - reduce allocation slightly
      allocation = Math.floor(allocation * 0.9);
    } else if (tokensPerSecond > 60) {
      // Very fast backends (groq) - can handle more
      allocation = Math.floor(allocation * 1.2);
    }

    // SMART REMAINING CONTEXT CALCULATION (council recommended)
    // Don't request more tokens than the backend can provide
    const contextLimit = this.getBackendContextLimit(backendName);
    const safetyBuffer = 4000; // ~1000 tokens safety margin
    const maxPossible = Math.floor((contextLimit - promptSize - safetyBuffer) / 4); // chars to tokens

    // Cap allocation at what's physically possible
    if (maxPossible > 0 && allocation > maxPossible) {
      console.error(`[GenerateFile] 📊 Capping tokens from ${allocation} to ${maxPossible} (context limit)`);
      allocation = maxPossible;
    }

    // Ensure minimum and maximum bounds (increased max from 8000 to 16000)
    return Math.max(1500, Math.min(allocation, MAX_RESPONSE_TOKENS));
  }

  /**
   * Calculate dynamic timeout based on backend speed and allocated tokens
   * @param {string} backendName - Backend identifier
   * @param {number} maxTokens - Allocated response tokens
   * @returns {number} Timeout in milliseconds, clamped to [30s, 5min]
   */
  calculateDynamicTimeout(backendName, maxTokens) {
    const tokensPerSecond = this.estimateBackendSpeed(backendName);

    // Estimated generation time + 50% buffer + 10s base overhead
    const estimatedSeconds = (maxTokens / tokensPerSecond) * 1.5 + 10;

    // Ensure minimum 30s, maximum 5min
    const timeoutSeconds = Math.max(30, Math.min(estimatedSeconds, 300));

    return Math.floor(timeoutSeconds * 1000);
  }

  /**
   * Try dual-mode generation: coding model generates, reasoning model reviews/fixes
   * This leverages both local models for iteration before falling back to cloud.
   * Never throws: any failure is reported as `{ success: false, truncated: true }`.
   * @param {string} prompt - Generation prompt
   * @param {number} tokens - Token allocation
   * @param {string} modelProfile - Optional router model profile
   * @returns {Promise<{success: boolean, response: Object, truncated: boolean}>}
   */
  async tryDualModeGeneration(prompt, tokens, modelProfile) {
    try {
      // Check if dual mode is available (ports 8087 coding, 8088 reasoning)
      const dualAvailable = await this.checkDualModeAvailable();
      if (!dualAvailable) {
        return { success: false, response: null, truncated: true };
      }

      console.error('[GenerateFile] 🔄 Attempting dual-mode iteration (coding + reasoning)');

      // Step 1: Generate with coding model (port 8087)
      // Floor the scaled budget so maxTokens is always an integer
      const scaledTokens = Math.min(Math.floor(tokens * RETRY_CONFIG.tokenScaleFactor), 12000);
      const codingResponse = await this.makeRequest(prompt, 'local', {
        maxTokens: scaledTokens,
        routerModel: modelProfile,
        timeout: 120000
      });

      const codingFinishReason = codingResponse.metadata?.finishReason;
      if (codingFinishReason === 'length') {
        // Still truncated, but let reasoning model try to complete
        console.error('[GenerateFile] 🔄 Coding model truncated, trying reasoning model to complete');
      }

      const codingCode = codingResponse.content || codingResponse;

      // Step 2: Have reasoning model review and potentially fix/complete
      const reviewPrompt = `Review and complete this code if it appears truncated or incomplete.
If the code looks complete, return it unchanged.
If it's truncated, complete the remaining parts following the same patterns.

ORIGINAL SPECIFICATION:
${prompt.substring(0, 2000)}...

GENERATED CODE:
${codingCode}

If complete, respond with:
STATUS: COMPLETE
CODE:
\`\`\`
[the code]
\`\`\`

If needs completion, respond with:
STATUS: COMPLETED
CODE:
\`\`\`
[the full completed code]
\`\`\``;

      const reviewResponse = await this.makeRequest(reviewPrompt, 'local', {
        maxTokens: scaledTokens,
        timeout: 120000
      });

      const reviewText = reviewResponse.content || reviewResponse;
      const reviewFinishReason = reviewResponse.metadata?.finishReason;

      // Parse the review response; only the CODE section matters to the caller
      const codeMatch = reviewText.match(/CODE:\s*```[\w]*\n([\s\S]*?)```/i);

      if (codeMatch) {
        return {
          success: true,
          response: {
            content: codeMatch[1].trim(),
            metadata: {
              ...reviewResponse.metadata,
              dualModeUsed: true,
              codingModel: 'local',
              reasoningModel: 'local'
            }
          },
          truncated: reviewFinishReason === 'length'
        };
      }

      // Fallback: return the coding response
      return {
        success: true,
        response: codingResponse,
        truncated: codingFinishReason === 'length'
      };
    } catch (error) {
      console.error(`[GenerateFile] ⚠️ Dual-mode failed: ${error.message}`);
      return { success: false, response: null, truncated: true };
    }
  }

  /**
   * Check if dual-mode local backends are available by probing the /health
   * endpoints on localhost ports 8087 and 8088 (1s timeout each).
   * @returns {Promise<boolean>} True only when both probes return an OK response
   */
  async checkDualModeAvailable() {
    try {
      // Quick health check on dual ports
      const [codingHealth, reasoningHealth] = await Promise.all([
        fetch('http://localhost:8087/health', { signal: AbortSignal.timeout(1000) }).catch(() => null),
        fetch('http://localhost:8088/health', { signal: AbortSignal.timeout(1000) }).catch(() => null)
      ]);
      // Coerce: a failed probe yields null/undefined, but callers expect a boolean
      return Boolean(codingHealth?.ok && reasoningHealth?.ok);
    } catch {
      return false;
    }
  }

  /**
   * Detect if generated code appears truncated based on structure.
   * Heuristics: very short output, truncation markers in the last 100
   * characters, a spec that mentions "export" without any export declaration
   * in the code, or more than one unclosed brace.
   * @param {string} code - Generated code
   * @param {string} spec - Original specification
   * @returns {boolean} True when the code looks incomplete
   */
  detectCodeTruncation(code, spec) {
    if (!code || code.length < 50) return true;

    // Check for common truncation indicators
    const truncationMarkers = [
      /\.\.\.$/,                          // Ends with ...
      /\/\/\s*\.\.\./,                    // Comment with ...
      /\/\*\s*\.\.\.\s*\*\/$/,            // Block comment with ...
      /\/\/\s*(TODO|FIXME|incomplete)/i,  // Incomplete markers at end
      /{\s*$/,                            // Unclosed brace at end
      /\(\s*$/,                           // Unclosed paren at end
    ];

    const tail = code.slice(-100);
    const endsWithMarker = truncationMarkers.some((pattern) => pattern.test(tail));

    // Check if spec mentions 'export' but code doesn't have complete exports
    const specMentionsExport = /export/i.test(spec);
    const hasCompleteExport = /export\s+(default\s+)?(class|function|const|let|var|interface|type)\s+\w+/i.test(code);
    const missingExport = specMentionsExport && !hasCompleteExport && code.length > 100;

    // Check brace balance
    const openBraces = (code.match(/\{/g) || []).length;
    const closeBraces = (code.match(/\}/g) || []).length;
    const unbalancedBraces = openBraces > closeBraces + 1; // Allow 1 unclosed for partial

    return endsWithMarker || missingExport || unbalancedBraces;
  }
}

export default GenerateFileHandler;

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Platano78/Smart-AI-Bridge'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

generate-file-handler.js•26.4 KiB