Houtini-lm

index.ts•12.4 KiB

#!/usr/bin/env node
/**
 * Houtini LM — MCP Server for Local LLMs via OpenAI-compatible API
 *
 * Connects to LM Studio (or any OpenAI-compatible endpoint) and exposes
 * chat, custom prompts, and model info as MCP tools.
 */

import { Server } from '@modelcontextprotocol/sdk/server/index.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import {
  CallToolRequestSchema,
  ListToolsRequestSchema,
} from '@modelcontextprotocol/sdk/types.js';

// Trailing slashes are stripped so that `${LM_BASE_URL}/v1/...` never becomes `...//v1/...`
// when the environment variable is set to e.g. "http://host:1234/".
const LM_BASE_URL = (process.env.LM_STUDIO_URL || 'http://localhost:1234').replace(/\/+$/, '');
const LM_MODEL = process.env.LM_STUDIO_MODEL || '';
const LM_PASSWORD = process.env.LM_STUDIO_PASSWORD || '';
const DEFAULT_MAX_TOKENS = 4096;
const DEFAULT_TEMPERATURE = 0.3;

/**
 * Builds the HTTP headers sent with every request to the LLM endpoint.
 *
 * @returns JSON content-type header, plus a Bearer `Authorization` header
 *          when `LM_STUDIO_PASSWORD` is set.
 */
function apiHeaders(): Record<string, string> {
  const h: Record<string, string> = { 'Content-Type': 'application/json' };
  if (LM_PASSWORD) h['Authorization'] = `Bearer ${LM_PASSWORD}`;
  return h;
}

// ── OpenAI-compatible API helpers ────────────────────────────────────

interface ChatMessage {
  role: 'system' | 'user' | 'assistant';
  content: string;
}

interface ChatCompletionResponse {
  id: string;
  choices: Array<{
    message: { role: string; content: string };
    finish_reason: string;
  }>;
  model: string;
  usage?: { prompt_tokens: number; completion_tokens: number; total_tokens: number };
}

/**
 * Sends a non-streaming chat completion request to `/v1/chat/completions`.
 *
 * @param messages Conversation to send, in order.
 * @param options  Optional overrides; unset values fall back to
 *                 `DEFAULT_TEMPERATURE`, `DEFAULT_MAX_TOKENS` and `LM_STUDIO_MODEL`.
 * @returns The parsed response body (cast, not schema-validated).
 * @throws Error when the endpoint answers with a non-2xx status.
 */
async function chatCompletion(
  messages: ChatMessage[],
  options: { temperature?: number; maxTokens?: number; model?: string } = {},
): Promise<ChatCompletionResponse> {
  const body: Record<string, unknown> = {
    messages,
    temperature: options.temperature ?? DEFAULT_TEMPERATURE,
    max_tokens: options.maxTokens ?? DEFAULT_MAX_TOKENS,
    stream: false,
  };

  // The `model` field is only sent when one is configured or requested.
  const model = options.model || LM_MODEL;
  if (model) {
    body.model = model;
  }

  const res = await fetch(`${LM_BASE_URL}/v1/chat/completions`, {
    method: 'POST',
    headers: apiHeaders(),
    body: JSON.stringify(body),
  });

  if (!res.ok) {
    // Best effort: the error body is included when readable, otherwise left empty.
    const text = await res.text().catch(() => '');
    throw new Error(`LM Studio API error ${res.status}: ${text}`);
  }

  return (await res.json()) as ChatCompletionResponse;
}

/**
 * Fetches the model ids reported by `/v1/models`.
 *
 * @returns The `id` of every entry in the response's `data` array.
 * @throws Error on a non-2xx status or when the body has no `data` array.
 */
async function listModels(): Promise<string[]> {
  const res = await fetch(`${LM_BASE_URL}/v1/models`, { headers: apiHeaders() });
  if (!res.ok) throw new Error(`Failed to list models: ${res.status}`);

  const payload = (await res.json()) as { data?: Array<{ id: string }> };
  // The body is an unvalidated cast, so check the shape before iterating.
  if (!Array.isArray(payload.data)) {
    throw new Error('Failed to list models: response has no "data" array');
  }
  return payload.data.map((m) => m.id);
}

/**
 * Extracts the assistant text from a chat completion response.
 *
 * @returns The first choice's content, or `''` when there is no choice or no content.
 * @throws Error when the response carries no `choices` array at all.
 */
function replyText(resp: ChatCompletionResponse): string {
  // `resp` comes from an unchecked cast of the JSON body, so `choices` may be absent at runtime.
  if (!Array.isArray(resp.choices)) {
    throw new Error('LM Studio API returned a response without a "choices" array');
  }
  return resp.choices[0]?.message?.content ?? '';
}

/**
 * Formats the "Model | Tokens" footer appended to tool replies.
 *
 * @param resp   Completion response to read `model` and `usage` from.
 * @param suffix Extra text appended verbatim after the token counts.
 * @returns The footer, or `''` when the response has no `usage` block.
 */
function usageFooter(resp: ChatCompletionResponse, suffix = ''): string {
  if (!resp.usage) return '';
  return `\n\n---\nModel: ${resp.model} | Tokens: ${resp.usage.prompt_tokens}→${resp.usage.completion_tokens}${suffix}`;
}

// ── Tool argument helpers ────────────────────────────────────────────

type ToolArgs = Record<string, unknown>;

// Declared as a type alias (not an interface) so it stays assignable to
// result types that carry an index signature.
type ToolResult = {
  content: Array<{ type: 'text'; text: string }>;
  isError?: boolean;
};

/**
 * Reads a required string argument.
 *
 * @throws Error when the key is absent or not a string.
 */
function requireString(args: ToolArgs, key: string): string {
  const value = args[key];
  if (typeof value !== 'string') {
    throw new Error(`Missing or invalid required argument "${key}" (expected a string)`);
  }
  return value;
}

/**
 * Reads an optional string argument; `null`/`undefined` count as "not provided".
 *
 * @throws Error when the key is present but not a string.
 */
function optionalString(args: ToolArgs, key: string): string | undefined {
  const value = args[key];
  if (value == null) return undefined;
  if (typeof value !== 'string') {
    throw new Error(`Invalid argument "${key}" (expected a string)`);
  }
  return value;
}

/**
 * Reads an optional numeric argument; `null`/`undefined` count as "not provided".
 *
 * @throws Error when the key is present but not a finite number.
 */
function optionalNumber(args: ToolArgs, key: string): number | undefined {
  const value = args[key];
  if (value == null) return undefined;
  if (typeof value !== 'number' || !Number.isFinite(value)) {
    throw new Error(`Invalid argument "${key}" (expected a number)`);
  }
  return value;
}

/** Wraps plain text in the single-text-item result shape returned by every tool. */
function textResult(text: string): ToolResult {
  return { content: [{ type: 'text', text }] };
}

// ── MCP Tool definitions ─────────────────────────────────────────────

const TOOLS = [
  {
    name: 'chat',
    description:
      'Delegate a bounded task to the local LLM (Qwen3-Coder, ~3-4 tok/s). ' +
      'Best for: quick code explanation, pattern recognition, boilerplate generation, knowledge questions. ' +
      'Use when you can work in parallel — fire this off and continue your own work. ' +
      'RULES: (1) Always send COMPLETE code, never truncated — the local LLM WILL hallucinate details for missing code. ' +
      '(2) Set max_tokens to match expected output: 150 for quick answers (~45s), 300 for explanations (~100s), 500 for code generation (~170s). ' +
      '(3) Be explicit about output format in your message ("respond in 3 bullets", "return only the function"). ' +
      'DO NOT use for: multi-step reasoning, creative writing, tasks needing >500 token output, or anything requiring tool use (use lm-taskrunner for tool-augmented tasks via mcpo).',
    inputSchema: {
      type: 'object' as const,
      properties: {
        message: {
          type: 'string',
          description:
            'The task. Be specific about expected output format and length. Include COMPLETE code — never truncate.',
        },
        system: {
          type: 'string',
          description:
            'Persona for the local LLM. Be specific: "Senior TypeScript dev" not "helpful assistant". Short personas (under 30 words) get best results.',
        },
        temperature: {
          type: 'number',
          description:
            '0.1 for factual/code tasks, 0.3 for analysis (default), 0.7 for creative suggestions. Stay under 0.5 for code.',
        },
        max_tokens: {
          type: 'number',
          description:
            'Cap this to match expected output. 150=quick answer, 300=explanation, 500=code generation. Lower = faster. Default 4096 is almost always too high.',
        },
      },
      required: ['message'],
    },
  },
  {
    name: 'custom_prompt',
    description:
      'Structured analysis on the local LLM with explicit system/context/instruction separation. ' +
      'This 3-part format gets the best results from local models — the separation prevents context bleed. ' +
      'System sets persona (be specific: "Senior TypeScript dev reviewing for security bugs"). ' +
      'Context provides COMPLETE data (full source file, full error log — never truncated). ' +
      'Instruction states exactly what to produce (under 50 words for best results). ' +
      'Best for: code review, comparison, refactoring suggestions, structured analysis. ' +
      'Expect 30-180s response time depending on max_tokens.',
    inputSchema: {
      type: 'object' as const,
      properties: {
        system: {
          type: 'string',
          description:
            'Persona. Be specific and under 30 words. Example: "Expert Node.js developer focused on error handling and edge cases."',
        },
        context: {
          type: 'string',
          description:
            'The COMPLETE data to analyse. Full source code, full logs, full text. NEVER truncate — the local LLM fills gaps with plausible hallucinations.',
        },
        instruction: {
          type: 'string',
          description:
            'What to produce. Under 50 words. Specify format: "List 3 bugs as bullet points" or "Return a JSON array of {line, issue, fix}".',
        },
        temperature: {
          type: 'number',
          description: '0.1 for bugs/review, 0.3 for analysis (default), 0.5 for suggestions.',
        },
        max_tokens: {
          type: 'number',
          description:
            'Match to expected output. 200 for bullets, 400 for detailed review, 600 for code generation.',
        },
      },
      required: ['instruction'],
    },
  },
  {
    name: 'code_task',
    description:
      'Purpose-built for code analysis tasks. Wraps the local LLM with an optimised code-review system prompt. ' +
      'Provide COMPLETE source code and a specific task — returns analysis in ~30-180s. ' +
      'Ideal for parallel execution: delegate to local LLM while you handle other work. ' +
      'The local LLM excels at: explaining code, finding common bugs, suggesting improvements, comparing patterns, generating boilerplate. ' +
      'It struggles with: subtle/adversarial bugs, multi-file reasoning, design tasks requiring integration. ' +
      'Output capped at 500 tokens by default (override with max_tokens).',
    inputSchema: {
      type: 'object' as const,
      properties: {
        code: {
          type: 'string',
          description: 'COMPLETE source code. Never truncate. Include imports and full function bodies.',
        },
        task: {
          type: 'string',
          description:
            'What to do. Be specific and concise: "Find bugs", "Explain this function", "Add error handling to fetchData", "Compare these two approaches".',
        },
        language: {
          type: 'string',
          description: 'Programming language for context: "typescript", "python", "rust", etc.',
        },
        max_tokens: {
          type: 'number',
          description:
            'Expected output size. Default 500. Use 200 for quick answers, 800 for code generation.',
        },
      },
      required: ['code', 'task'],
    },
  },
  {
    name: 'list_models',
    description:
      'List models currently loaded in LM Studio. Use to verify which model will handle delegated tasks.',
    inputSchema: { type: 'object' as const, properties: {} },
  },
  {
    name: 'health_check',
    description:
      'Check connectivity and latency to the local LM Studio instance. Run before delegating time-sensitive tasks.',
    inputSchema: { type: 'object' as const, properties: {} },
  },
];

// ── MCP Server ───────────────────────────────────────────────────────

const server = new Server(
  { name: 'houtini-lm', version: '2.1.0' },
  { capabilities: { tools: {} } },
);

server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));

// Dispatches a tool call by name. Any failure (bad arguments, HTTP error,
// unknown tool) is caught below and returned as a text result with `isError: true`.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name } = request.params;
  // A call may arrive without `arguments`; fall back to an empty object so
  // the argument helpers report which field is missing.
  const args: ToolArgs = request.params.arguments ?? {};

  try {
    switch (name) {
      case 'chat': {
        const message = requireString(args, 'message');
        const system = optionalString(args, 'system');
        const temperature = optionalNumber(args, 'temperature');
        const maxTokens = optionalNumber(args, 'max_tokens');

        const messages: ChatMessage[] = [];
        if (system) messages.push({ role: 'system', content: system });
        messages.push({ role: 'user', content: message });

        const resp = await chatCompletion(messages, { temperature, maxTokens });
        return textResult(replyText(resp) + usageFooter(resp));
      }

      case 'custom_prompt': {
        const instruction = requireString(args, 'instruction');
        const system = optionalString(args, 'system');
        const context = optionalString(args, 'context');
        const temperature = optionalNumber(args, 'temperature');
        const maxTokens = optionalNumber(args, 'max_tokens');

        const messages: ChatMessage[] = [];
        if (system) messages.push({ role: 'system', content: system });

        // Without context the instruction is sent on its own; with context the
        // two are labelled so the model can tell them apart.
        const userContent = context
          ? `Context:\n${context}\n\nInstruction:\n${instruction}`
          : instruction;
        messages.push({ role: 'user', content: userContent });

        const resp = await chatCompletion(messages, { temperature, maxTokens });
        return textResult(replyText(resp));
      }

      case 'code_task': {
        const code = requireString(args, 'code');
        const task = requireString(args, 'task');
        const lang = optionalString(args, 'language') || 'unknown';
        const maxTokens = optionalNumber(args, 'max_tokens') ?? 500;

        const codeMessages: ChatMessage[] = [
          {
            role: 'system',
            content: `Expert ${lang} developer. Analyse the provided code and complete the task. Be specific — reference line numbers, function names, and concrete fixes. No preamble.`,
          },
          {
            role: 'user',
            content: `Task: ${task}\n\n\`\`\`${lang}\n${code}\n\`\`\``,
          },
        ];

        // Temperature is fixed at 0.2 for this tool; it is not caller-configurable.
        const codeResp = await chatCompletion(codeMessages, {
          temperature: 0.2,
          maxTokens,
        });
        return textResult(replyText(codeResp) + usageFooter(codeResp, ` | ${lang}`));
      }

      case 'list_models': {
        const models = await listModels();
        return textResult(
          models.length
            ? `Loaded models:\n${models.map((m) => ` • ${m}`).join('\n')}`
            : 'No models currently loaded.',
        );
      }

      case 'health_check': {
        // Latency is the wall-clock time of one `/v1/models` round trip.
        const start = Date.now();
        const models = await listModels();
        const ms = Date.now() - start;
        return textResult(
          `Connected to ${LM_BASE_URL} (${ms}ms)\nAuth: ${LM_PASSWORD ? 'enabled' : 'none'}\nModels loaded: ${models.length}${models.length ? '\n' + models.join(', ') : ''}`,
        );
      }

      default:
        throw new Error(`Unknown tool: ${name}`);
    }
  } catch (error) {
    const failure: ToolResult = {
      content: [{ type: 'text', text: `Error: ${error instanceof Error ? error.message : String(error)}` }],
      isError: true,
    };
    return failure;
  }
});

/**
 * Connects the server to a stdio transport and logs the target URL to stderr.
 * Only stderr is written to here; stdout is left to the transport.
 */
async function main() {
  const transport = new StdioServerTransport();
  await server.connect(transport);
  process.stderr.write(`Houtini LM server running (${LM_BASE_URL})\n`);
}

main().catch((error) => {
  process.stderr.write(`Fatal error: ${error}\n`);
  process.exit(1);
});

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/houtini-ai/lm'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

index.ts•12.4 KiB