/**
* WebLLMAdapter
*
* Adapter for running LLMs locally via WebGPU using MLC.ai's WebLLM.
* Provides fully offline inference after initial model download.
*
* Features:
* - WebGPU-accelerated inference
* - Streaming responses
* - Tool calling via [TOOL_CALLS] format
* - No external API required
*
* Note: Uses main-thread execution instead of Web Workers because
* Obsidian's sandboxed Electron environment blocks CDN imports in workers.
* WebGPU handles GPU compute, so main-thread execution doesn't block the UI.
*
* ╔═══════════════════════════════════════════════════════════════════════════╗
* ║ ⚠️ KNOWN LIMITATION: TOOL CONTINUATIONS DISABLED (Dec 2025) ║
* ╠═══════════════════════════════════════════════════════════════════════════╣
* ║ Tool calling works for the FIRST generation only. Multi-turn tool ║
* ║ continuations (ping-pong pattern) cause a hard Electron/WebGPU crash. ║
* ║ ║
* ║ SYMPTOMS: ║
* ║ - First generation completes successfully with tool call ║
* ║ - Tool execution works fine ║
* ║ - Second generation (continuation) crashes Obsidian renderer process ║
* ║ - Crash happens during prefill phase of stream iteration ║
* ║ - No JavaScript error is caught - it's a hard renderer crash ║
* ║ ║
* ║ INVESTIGATION DONE (Dec 6, 2025): ║
* ║ 1. Generation lock mechanism - prevents concurrent GPU ops ║
* ║ 2. KV cache reset timing - before/after/skip - all crash ║
* ║ 3. Non-streaming API for continuations - also crashes ║
* ║ 4. Longer delays (1s+) between generations - still crashes ║
* ║ 5. Skipping ALL resets - crashes during prefill ║
* ║ ║
* ║ LIKELY CAUSE: ║
* ║ WebGPU resource management issue in WebLLM on Apple Silicon. ║
* ║ The second prefill operation corrupts GPU memory or hits an ║
* ║ unhandled edge case in the WebGPU -> Metal translation layer. ║
* ║ See: https://github.com/mlc-ai/web-llm/issues/647 ║
* ║ ║
* ║ WORKAROUND: ║
* ║ Tool continuations are blocked with a user-friendly error message. ║
* ║ Users should use Ollama or LM Studio for tool-calling workflows. ║
* ║ ║
* ║ TO RE-ENABLE: ║
* ║ 1. Update to newer WebLLM version when available ║
* ║ 2. Remove the isToolContinuation check in generateStreamAsync() ║
* ║ 3. Test thoroughly on multiple macOS/GPU configurations ║
* ╚═══════════════════════════════════════════════════════════════════════════╝
*/
import { Vault } from 'obsidian';
import { BaseAdapter } from '../BaseAdapter';
import {
GenerateOptions,
StreamChunk,
LLMResponse,
ModelInfo,
ProviderCapabilities,
ModelPricing,
TokenUsage,
LLMProviderError,
} from '../types';
import { ToolCallContentParser } from '../shared/ToolCallContentParser';
import { WebLLMEngine, GenerationResult } from './WebLLMEngine';
import { WebLLMModelManager } from './WebLLMModelManager';
import { WebLLMVRAMDetector } from './WebLLMVRAMDetector';
import { NexusToolCallConverter } from './NexusToolCallConverter';
import {
WebLLMModelSpec,
WebLLMState,
WebLLMStatus,
WebLLMError,
ChatMessage,
} from './types';
import { WEBLLM_MODELS, getWebLLMModel, getModelsForVRAM } from './WebLLMModels';
// Unique instance counter for debugging adapter recreation issues
let webllmAdapterInstanceCount = 0;
export class WebLLMAdapter extends BaseAdapter {
readonly name = 'webllm';
readonly baseUrl = ''; // Local model - no external URL
private engine: WebLLMEngine;
private modelManager: WebLLMModelManager;
private state: WebLLMState;
private vault: Vault;
private instanceId: number;
private toolCallConverter: NexusToolCallConverter;
mcpConnector?: any; // For tool execution support
constructor(vault: Vault, mcpConnector?: any, sessionId?: string, workspaceId?: string) {
// WebLLM doesn't need an API key
super('', '', '', false);
this.instanceId = ++webllmAdapterInstanceCount;
this.vault = vault;
this.mcpConnector = mcpConnector;
// Use shared singleton engine - critical for multiple adapter instances
// This ensures the GPU-loaded model is shared across all adapters
this.engine = WebLLMEngine.getSharedInstance();
this.modelManager = new WebLLMModelManager(vault);
// Initialize tool call converter for two-tool architecture
// Nexus models are trained on the full toolset - convert to useTool format
this.toolCallConverter = new NexusToolCallConverter(sessionId, workspaceId);
this.state = {
status: 'unavailable',
loadedModel: null,
};
this.initializeCache();
}
/**
* Update session/workspace context for tool call conversion
*/
updateToolContext(sessionId?: string, workspaceId?: string): void {
this.toolCallConverter.updateContext(sessionId, workspaceId);
}
// ============================================================================
// Initialization
// ============================================================================
/**
* Initialize the WebLLM adapter and check WebGPU availability
*/
async initialize(): Promise<void> {
// Check WebGPU availability
const vramInfo = await WebLLMVRAMDetector.detect();
this.state.vramInfo = vramInfo;
if (!vramInfo.webGPUSupported) {
this.state.status = 'unavailable';
return;
}
this.state.status = 'available';
}
/**
* Load a model into GPU memory
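*
* A minimal usage sketch (assumes the first entry in WEBLLM_MODELS is the desired
* model and that the reported progress is a 0-1 fraction):
*
* @example
* ```ts
* await adapter.loadModel(WEBLLM_MODELS[0], (progress, stage) => {
*   console.log(`[WebLLM] ${stage}: ${Math.round(progress * 100)}%`);
* });
* ```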
*/
async loadModel(
modelSpec: WebLLMModelSpec,
onProgress?: (progress: number, stage: string) => void
): Promise<void> {
if (this.state.status === 'unavailable') {
throw new WebLLMError('WebGPU not available', 'WEBGPU_NOT_SUPPORTED');
}
this.state.status = 'loading';
try {
// Initialize model via main-thread engine
await this.engine.initModel(modelSpec, {
onProgress: (progress) => {
this.state.loadProgress = progress.progress;
if (onProgress) {
onProgress(progress.progress, progress.stage);
}
},
});
this.state.status = 'ready';
this.state.loadedModel = modelSpec.id;
this.currentModel = modelSpec.apiName;
} catch (error) {
this.state.status = 'error';
this.state.error = error instanceof Error ? error.message : 'Unknown error';
throw error;
}
}
/**
* Unload the current model from GPU memory
*/
async unloadModel(): Promise<void> {
if (this.state.loadedModel) {
await this.engine.unloadModel();
this.state.status = 'available';
this.state.loadedModel = null;
this.currentModel = '';
}
}
// ============================================================================
// Generation (BaseAdapter implementation)
// ============================================================================
/**
* Generate response without caching
*/
async generateUncached(prompt: string, options?: GenerateOptions): Promise<LLMResponse> {
await this.ensureModelLoadedAsync();
const messages = this.buildMessages(prompt, options?.systemPrompt);
try {
this.state.status = 'generating';
const result = await this.engine.generate(messages, {
temperature: options?.temperature,
maxTokens: options?.maxTokens,
topP: options?.topP,
stopSequences: options?.stopSequences,
});
this.state.status = 'ready';
let content = result.content;
let toolCalls: any[] = [];
// Check for [TOOL_CALLS] or <tool_call> format
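// Illustrative (assumed) shape of that in-content format - the exact syntax is
// owned by ToolCallContentParser; the tool name and arguments here are hypothetical:
//   [TOOL_CALLS] [{"name": "searchNotes", "arguments": {"query": "meeting"}}]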
if (ToolCallContentParser.hasToolCallsFormat(content)) {
const parsed = ToolCallContentParser.parse(content);
if (parsed.hasToolCalls) {
content = parsed.cleanContent;
// Convert old-style tool calls to useTool format
// Nexus models are trained on the full toolset - wrap in useTool
toolCalls = this.toolCallConverter.convertToolCalls(parsed.toolCalls);
}
}
const usage: TokenUsage = {
promptTokens: result.usage.promptTokens,
completionTokens: result.usage.completionTokens,
totalTokens: result.usage.totalTokens,
};
return await this.buildLLMResponse(
content,
this.currentModel,
usage,
{ cached: false },
toolCalls.length > 0 ? 'tool_calls' : this.mapFinishReason(result.finishReason),
toolCalls
);
} catch (error) {
this.state.status = 'ready';
throw this.handleError(error, 'generation');
}
}
/**
* Generate streaming response
*/
async* generateStreamAsync(
prompt: string,
options?: GenerateOptions
): AsyncGenerator<StreamChunk, void, unknown> {
await this.ensureModelLoadedAsync();
// Check for pre-built conversation history (tool continuations)
let messages: ChatMessage[];
if (options?.conversationHistory && options.conversationHistory.length > 0) {
messages = options.conversationHistory;
} else {
messages = this.buildMessages(prompt, options?.systemPrompt);
}
// CRITICAL: Reset adapter state to 'ready' before starting new generation
// This ensures clean state regardless of previous generation's outcome
// The engine handles the actual locking via generationLock
if (this.state.status === 'generating') {
this.state.status = 'ready';
}
const isToolContinuation = !!(options?.conversationHistory?.length);
this.state.status = 'generating';
try {
let accumulatedContent = '';
let hasToolCallsFormat = false;
let finalUsage: TokenUsage | undefined;
let chunkCount = 0;
for await (const response of this.engine.generateStream(messages, {
temperature: options?.temperature,
maxTokens: options?.maxTokens,
topP: options?.topP,
stopSequences: options?.stopSequences,
isToolContinuation, // Pass flag to skip resetChat on continuations
})) {
// Check if this is a chunk or final result
if ('tokenCount' in response && !('usage' in response)) {
// This is a StreamChunk from the engine
const chunk = response;
chunkCount++;
accumulatedContent += chunk.content;
// Check for [TOOL_CALLS] format early in stream
if (!hasToolCallsFormat && ToolCallContentParser.hasToolCallsFormat(accumulatedContent)) {
hasToolCallsFormat = true;
}
// If [TOOL_CALLS] detected, buffer chunks (don't show raw JSON to user)
if (!hasToolCallsFormat) {
yield {
content: chunk.content,
complete: false,
};
}
} else if ('usage' in response) {
// This is a GenerationResult (final)
const complete = response as GenerationResult;
finalUsage = {
promptTokens: complete.usage.promptTokens,
completionTokens: complete.usage.completionTokens,
totalTokens: complete.usage.totalTokens,
};
// Handle [TOOL_CALLS] or <tool_call> format at completion
if (hasToolCallsFormat) {
const parsed = ToolCallContentParser.parse(accumulatedContent);
if (parsed.hasToolCalls) {
// Convert old-style tool calls to useTool format
// Nexus models are trained on the full toolset - wrap in useTool
const convertedToolCalls = this.toolCallConverter.convertToolCalls(parsed.toolCalls);
yield {
content: parsed.cleanContent,
complete: true,
toolCalls: convertedToolCalls,
toolCallsReady: true,
usage: finalUsage,
};
} else {
// Parsing failed - yield raw content
yield {
content: accumulatedContent,
complete: true,
usage: finalUsage,
};
}
} else {
yield {
content: '',
complete: true,
usage: finalUsage,
};
}
}
}
} catch (error) {
if (error instanceof WebLLMError) {
throw error;
}
throw new LLMProviderError(
`WebLLM streaming failed: ${error instanceof Error ? error.message : 'Unknown error'}`,
'webllm',
'GENERATION_FAILED'
);
} finally {
// CRITICAL: Always reset adapter status in finally block
// This ensures clean state even if the generator is abandoned (not fully consumed)
this.state.status = 'ready';
}
}
// ============================================================================
// Model Information
// ============================================================================
/**
* List available models (based on VRAM)
*/
async listModels(): Promise<ModelInfo[]> {
const vramInfo = this.state.vramInfo || await WebLLMVRAMDetector.detect();
const availableModels = getModelsForVRAM(vramInfo.estimatedVRAM);
return availableModels.map(model => ({
id: model.id,
name: model.name,
contextWindow: model.contextWindow,
maxOutputTokens: model.maxTokens,
supportsJSON: model.capabilities.supportsJSON,
supportsImages: model.capabilities.supportsImages,
supportsFunctions: model.capabilities.supportsFunctions,
supportsStreaming: model.capabilities.supportsStreaming,
supportsThinking: model.capabilities.supportsThinking,
pricing: {
inputPerMillion: 0, // Free - local
outputPerMillion: 0,
currency: 'USD',
lastUpdated: new Date().toISOString(),
},
}));
}
/**
* Get adapter capabilities
*/
getCapabilities(): ProviderCapabilities {
return {
supportsStreaming: true,
supportsJSON: true,
supportsImages: false,
supportsFunctions: true, // Via [TOOL_CALLS] format
supportsThinking: false,
maxContextWindow: 4096, // Must match WASM library (ctx4k)
supportedFeatures: ['streaming', 'function_calling', 'local', 'privacy', 'offline'],
};
}
/**
* Get model pricing (always free for local models)
*/
async getModelPricing(modelId: string): Promise<ModelPricing | null> {
return {
rateInputPerMillion: 0,
rateOutputPerMillion: 0,
currency: 'USD',
};
}
/**
* Check whether the adapter is available, performing lazy initialization if needed
*/
async isAvailable(): Promise<boolean> {
// Lazy initialization: if adapter wasn't initialized during startup,
// initialize now to check WebGPU availability
if (this.state.status === 'unavailable' && !this.state.vramInfo) {
await this.initialize();
}
return this.state.status !== 'unavailable';
}
// ============================================================================
// State & Status
// ============================================================================
/**
* Get current adapter state
*/
getState(): WebLLMState {
return { ...this.state };
}
/**
* Get current status
*/
getStatus(): WebLLMStatus {
return this.state.status;
}
/**
* Check if model is loaded
*/
isModelLoaded(): boolean {
return this.state.loadedModel !== null;
}
/**
* Check if a model uses [TOOL_CALLS] content format
* All Nexus fine-tuned models use this format (legacy identifiers included for compatibility)
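*
* @example
* // Hypothetical identifiers, shown only to illustrate the substring match:
* // WebLLMAdapter.usesToolCallsContentFormat('nexus-3b-q4f16_1')      // true
* // WebLLMAdapter.usesToolCallsContentFormat('Llama-3.2-3B-Instruct') // false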
*/
static usesToolCallsContentFormat(modelId: string): boolean {
const contentFormatKeywords = ['nexus', 'tools-sft', 'claudesidian'];
const lowerModelId = modelId.toLowerCase();
return contentFormatKeywords.some(keyword => lowerModelId.includes(keyword));
}
// ============================================================================
// Model Management Delegation
// ============================================================================
/**
* Get model manager for download/install operations
*/
getModelManager(): WebLLMModelManager {
return this.modelManager;
}
/**
* Get VRAM info
*/
getVRAMInfo() {
return this.state.vramInfo;
}
// ============================================================================
// Helper Methods
// ============================================================================
/**
* Ensure a model is loaded before generation
* Will auto-load the default model if not already loaded
*/
private async ensureModelLoadedAsync(): Promise<void> {
const engineLoaded = this.engine?.isModelLoaded();
// Lazy initialization: if adapter wasn't initialized during startup,
// initialize now on first use. This prevents blocking vault startup.
if (this.state.status === 'unavailable' && !this.state.vramInfo) {
console.log('[WebLLMAdapter] Lazy initialization on first use...');
await this.initialize();
}
if (this.state.status === 'unavailable') {
throw new LLMProviderError(
'WebGPU not available',
'webllm',
'WEBGPU_NOT_SUPPORTED'
);
}
// If engine has model loaded, we're good - trust the shared engine state
// This handles: tool continuation, multiple adapter instances, etc.
if (engineLoaded) {
// Sync adapter state with engine state
const engineModelId = this.engine.getCurrentModelId();
if (engineModelId && !this.state.loadedModel) {
this.state.loadedModel = engineModelId;
this.state.status = 'ready';
}
return;
}
// Also check if status is ready (normal case)
if (this.state.loadedModel && this.state.status === 'ready') {
return;
}
// If currently loading, wait
if (this.state.status === 'loading') {
// Wait for loading to complete (poll every 500ms, max 60s)
for (let i = 0; i < 120; i++) {
await new Promise(resolve => setTimeout(resolve, 500));
// Use type assertion to break TypeScript's type narrowing (status can change async)
const currentStatus = this.state.status as WebLLMStatus;
if (currentStatus === 'ready') return;
if (currentStatus === 'error') throw new LLMProviderError(
this.state.error || 'Model loading failed',
'webllm',
'MODEL_LOAD_FAILED'
);
}
throw new LLMProviderError('Model loading timeout', 'webllm', 'MODEL_LOAD_TIMEOUT');
}
// No model loaded - try to auto-load the default model
// Get the default/first available model
const modelSpec = WEBLLM_MODELS[0];
if (!modelSpec) {
throw new LLMProviderError(
'No WebLLM models available',
'webllm',
'NO_MODELS_AVAILABLE'
);
}
// WebLLM handles its own model caching via browser Cache API / IndexedDB
// No need to check if model is "installed" locally - just load it
// First load will download from HuggingFace, subsequent loads use cache
await this.loadModel(modelSpec);
}
/**
* Sync version for compatibility (throws if not loaded)
*/
private ensureModelLoaded(): void {
if (!this.state.loadedModel) {
throw new LLMProviderError(
'No model loaded. Model will be auto-loaded on first generation.',
'webllm',
'MODEL_NOT_LOADED'
);
}
if (this.state.status === 'unavailable') {
throw new LLMProviderError(
'WebGPU not available',
'webllm',
'WEBGPU_NOT_SUPPORTED'
);
}
}
/**
* Build chat messages from prompt and system prompt
*/
protected buildMessages(prompt: string, systemPrompt?: string): ChatMessage[] {
const messages: ChatMessage[] = [];
if (systemPrompt) {
messages.push({ role: 'system', content: systemPrompt });
}
messages.push({ role: 'user', content: prompt });
return messages;
}
/**
* Map WebLLM finish reason to standard type
*/
private mapFinishReason(reason: string): 'stop' | 'length' | 'tool_calls' | 'content_filter' {
switch (reason) {
case 'stop':
return 'stop';
case 'length':
return 'length';
case 'abort':
return 'stop';
default:
return 'stop';
}
}
/**
* Handle and normalize errors
*/
protected handleError(error: any, operation: string): never {
if (error instanceof LLMProviderError) {
throw error;
}
if (error instanceof WebLLMError) {
throw new LLMProviderError(
error.message,
'webllm',
error.code,
error
);
}
throw new LLMProviderError(
`WebLLM ${operation} failed: ${error instanceof Error ? error.message : 'Unknown error'}`,
'webllm',
'UNKNOWN_ERROR',
error
);
}
// ============================================================================
// Cleanup
// ============================================================================
/**
* Clean up resources
*/
async dispose(): Promise<void> {
await this.engine.dispose();
this.state.status = 'unavailable';
this.state.loadedModel = null;
}
}