#!/usr/bin/env node
// IMPORTANT: Suppress ALL stdout logging for MCP compatibility
// MCP servers must only output JSON-RPC messages to stdout
// Set environment variables to suppress Crawlee/Apify logging
process.env.CRAWLEE_LOG_LEVEL = 'OFF';
process.env.APIFY_LOG_LEVEL = 'OFF';
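// Example (illustrative) MCP client registration for this server — the entry name
// matches the server name declared below; the path is a placeholder, not a real build artifact:
// {
//   "mcpServers": {
//     "mcp-web-docs": { "command": "node", "args": ["path/to/build/index.js"] }
//   }
// }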
// Import and suppress Crawlee logging
import { log, Configuration } from 'crawlee';
log.setLevel(log.LEVELS.OFF);
// Configure Crawlee to be silent
Configuration.getGlobalConfig().set('logLevel', 'OFF');
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
import { CallToolRequestSchema, ErrorCode, ListToolsRequestSchema, McpError } from '@modelcontextprotocol/sdk/types.js';
import { DocumentStore } from './storage/storage.js';
import { FastEmbeddings } from './embeddings/fastembed.js';
import { WebDocumentProcessor } from './processor/processor.js';
import { IndexingStatusTracker } from './indexing/status.js';
import { IndexingQueueManager } from './indexing/queue-manager.js';
import { DocsConfig, loadConfig, isValidPublicUrl, normalizeUrl } from './config.js';
import { DocsCrawler } from './crawler/docs-crawler.js';
import { AuthManager } from './crawler/auth.js';
import { fetchFavicon } from './util/favicon.js';
import { DocumentChunk, IndexingStatus } from './types.js';
import { generateDocId } from './util/docs.js';
import { logger } from './util/logger.js';
import {
StorageStateSchema,
safeJsonParse,
validateToolArgs,
sanitizeErrorMessage,
detectPromptInjection,
wrapExternalContent,
addInjectionWarnings,
SessionExpiredError,
AddDocumentationArgsSchema,
AuthenticateArgsSchema,
ClearAuthArgsSchema,
SearchDocumentationArgsSchema,
ReindexDocumentationArgsSchema,
DeleteDocumentationArgsSchema,
SetTagsArgsSchema,
CreateCollectionArgsSchema,
DeleteCollectionArgsSchema,
UpdateCollectionArgsSchema,
GetCollectionArgsSchema,
AddToCollectionArgsSchema,
RemoveFromCollectionArgsSchema,
SearchCollectionArgsSchema,
type ValidatedStorageState,
} from './util/security.js';
import type { StorageState } from './crawler/crawlee-crawler.js';
/** Progress token type from MCP spec */
type ProgressToken = string | number;
class WebDocsServer {
private server: McpServer;
private config!: DocsConfig;
private store!: DocumentStore;
private processor!: WebDocumentProcessor;
private statusTracker: IndexingStatusTracker;
private indexingQueue: IndexingQueueManager;
private authManager!: AuthManager;
/** Maps operation ID to progress token for MCP notifications */
private progressTokens: Map<string, ProgressToken> = new Map();
/** Tracks last notified progress to throttle notifications */
private lastNotifiedProgress: Map<string, number> = new Map();
constructor() {
// Initialize basic components that don't need async initialization
this.statusTracker = new IndexingStatusTracker();
this.indexingQueue = new IndexingQueueManager();
// Set up status change listener for MCP progress notifications
this.statusTracker.addStatusListener((status) => {
this.sendProgressNotification(status);
});
// Initialize MCP server
this.server = new McpServer(
{
name: 'mcp-web-docs',
version: '1.0.0',
},
{
capabilities: {
tools: {},
},
}
);
// Set up tool handlers
this.setupToolHandlers();
// Handle errors
this.server.server.onerror = (error: Error) => logger.error('[MCP Error]', error);
}
/**
* Send MCP progress notification to client.
* Only sends if the client provided a progressToken in the original request.
* Throttled to avoid flooding - sends on 5% increments or on terminal status changes (complete/failed/cancelled).
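*
* @example
* // Shape of the emitted JSON-RPC notification (per the MCP progress spec; values illustrative):
* // { "method": "notifications/progress",
* //   "params": { "progressToken": "abc-123", "progress": 45, "total": 100, "message": "Indexing (45/100 pages)" } }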
*/
private async sendProgressNotification(status: IndexingStatus): Promise<void> {
const progressToken = this.progressTokens.get(status.id);
// Only send if we have a progress token from the client
if (!progressToken) {
logger.debug(`[Progress] No token for ${status.id}, skipping notification`);
return;
}
const progressPercent = Math.round(status.progress * 100);
const lastProgress = this.lastNotifiedProgress.get(status.id) ?? -1;
// Only notify on significant progress (5% increments) or when the operation reaches a terminal state
const isTerminal = status.status === 'complete' || status.status === 'failed' || status.status === 'cancelled';
const isSignificantProgress = progressPercent - lastProgress >= 5;
if (!isTerminal && !isSignificantProgress) {
return;
}
this.lastNotifiedProgress.set(status.id, progressPercent);
// Build human-readable message
let message = status.description;
if (status.pagesProcessed !== undefined && status.pagesFound !== undefined) {
message = `${status.description} (${status.pagesProcessed}/${status.pagesFound} pages)`;
}
try {
// Send MCP progress notification per spec:
// https://modelcontextprotocol.io/specification/2025-03-26/basic/utilities/progress
await this.server.server.notification({
method: 'notifications/progress',
params: {
progressToken,
progress: progressPercent,
total: 100,
message,
},
});
logger.info(`[Progress] Sent notification: ${progressPercent}% - ${message}`);
} catch (error) {
logger.debug(`[Progress] Failed to send notification:`, error);
}
// Clean up tracking once the operation reaches a terminal state
if (isTerminal) {
this.lastNotifiedProgress.delete(status.id);
this.progressTokens.delete(status.id);
}
}
private async initialize() {
// Load configuration
this.config = await loadConfig();
// Initialize components that need config
const embeddings = new FastEmbeddings();
this.store = new DocumentStore(this.config.dbPath, this.config.vectorDbPath, embeddings, this.config.cacheSize);
this.processor = new WebDocumentProcessor(embeddings, this.config.maxChunkSize);
// Initialize auth manager for handling authenticated crawls
this.authManager = new AuthManager(this.config.dataDir);
await this.authManager.initialize();
// Initialize storage
await this.store.initialize();
}
private setupToolHandlers(): void {
// List available tools
this.server.server.setRequestHandler(ListToolsRequestSchema, async () => ({
tools: [
{
name: 'add_documentation',
description: `Add new documentation site for indexing. Supports authenticated sites via the auth options.
IMPORTANT: Before calling this tool, ask the user if they want to restrict crawling to a specific path prefix. For example, if indexing https://docs.example.com/api/v2/overview, the user might want to restrict to '/api/v2' to avoid crawling unrelated sections of the site.
VERSIONING: If the user is indexing documentation for a versioned software package/library (e.g., React, Vue, Python, a database, an SDK), ask what version they want to associate with this documentation. Many packages have multiple versions with different APIs.
Do NOT ask about versioning for:
- Internal company documentation (wikis, best practices, runbooks)
- Single-version products or services
- Documentation the user indicates should always reflect "latest"
Examples where version matters: "React 18", "Python 3.11", "PostgreSQL 15", "Next.js 14"
Examples where version doesn't matter: "Company engineering handbook", "AWS console docs", "Confluence spaces"`,
inputSchema: {
type: 'object',
properties: {
url: {
type: 'string',
description: 'URL of the documentation site',
},
title: {
type: 'string',
description: 'Optional title for the documentation',
},
id: {
type: 'string',
description:
'Optional custom ID for the documentation (used for storage and identification). If not provided, an ID is auto-generated from the URL.',
},
pathPrefix: {
type: 'string',
description:
"Optional path prefix to restrict crawling. Only pages whose URL path starts with this prefix will be indexed. Must start with '/'. Example: '/api/v2' would only crawl pages under that path.",
},
tags: {
type: 'array',
items: { type: 'string' },
description:
'Optional tags to categorize the documentation (e.g., ["frontend", "mycompany"]). Tags help filter search results across multiple documentation sites.',
},
version: {
type: 'string',
description:
'Optional version identifier for versioned package documentation (e.g., "18", "v6.4", "3.11", "latest"). Helps distinguish between multiple versions of the same package.',
},
auth: {
type: 'object',
description: 'Authentication options for protected documentation sites',
properties: {
requiresAuth: {
type: 'boolean',
description: 'Set to true to open a browser for interactive login before crawling',
},
browser: {
type: 'string',
enum: ['chromium', 'chrome', 'firefox', 'webkit', 'edge'],
description:
"Optional. If omitted, the user's default browser is automatically detected from OS settings. Only specify to override.",
},
loginUrl: {
type: 'string',
description: 'Login page URL if different from main URL',
},
loginSuccessPattern: {
type: 'string',
description: 'URL regex pattern that indicates successful login',
},
loginSuccessSelector: {
type: 'string',
description: 'CSS selector that appears after successful login',
},
loginTimeoutSecs: {
type: 'number',
description: 'Timeout for login in seconds (default: 300)',
},
},
},
},
required: ['url'],
},
},
{
name: 'authenticate',
description:
"Open a browser window for interactive login to a protected site. The session will be saved and reused for future crawls. Use this before add_documentation for sites that require login. The user's default browser is automatically detected from OS settings - do NOT specify a browser unless the user explicitly requests a specific one.",
inputSchema: {
type: 'object',
properties: {
url: {
type: 'string',
description: 'URL of the site to authenticate to',
},
browser: {
type: 'string',
enum: ['chromium', 'chrome', 'firefox', 'webkit', 'edge'],
description:
"Optional. If omitted, the user's default browser is automatically detected from OS settings. Only specify this to override auto-detection with a specific browser.",
},
loginUrl: {
type: 'string',
description: 'Login page URL if different from main URL',
},
loginTimeoutSecs: {
type: 'number',
description: 'Timeout for login in seconds (default: 300 = 5 minutes)',
},
},
required: ['url'],
},
},
{
name: 'clear_auth',
description: 'Clear saved authentication session for a domain',
inputSchema: {
type: 'object',
properties: {
url: {
type: 'string',
description: 'URL of the site to clear authentication for',
},
},
required: ['url'],
},
},
{
name: 'list_documentation',
description:
'List all indexed documentation sites with their metadata including tags. Use this to see what documentation is available and what tags are assigned to each site. Each doc shows: url, title, tags[], lastIndexed, requiresAuth.',
inputSchema: {
type: 'object',
properties: {},
},
},
{
name: 'search_documentation',
description: `Search through indexed documentation using hybrid search (full-text + semantic).
## Query Tips for Best Results
1. **Be specific** - Include unique terms from what you're looking for
- Instead of: "Button props"
- Try: "Button props onClick disabled loading"
2. **Use exact phrases** - Wrap in quotes for exact matching
- "authentication middleware" finds that exact phrase
- authentication middleware finds pages with either word
3. **Include context** - Add related terms to narrow results
- API docs: "GET /users endpoint authentication headers"
- Config: "webpack config entry output plugins"
- Functions: "parseJSON function parameters return type"
4. **Combine concepts** - More terms = more precise results
- "Card component status primary negative props table"
- "database connection pool maxConnections timeout"
## Filtering Options
- **url**: Filter to a specific documentation site by URL
- **tags**: Filter to docs with specific tags. Use when user mentions a category, project, or team name (e.g., tags: ["frontend", "jimdo"] to search only frontend Jimdo docs)
## How Search Works
- Full-text search with stemming (run → runs, running)
- Fuzzy matching for typos (authetication → authentication)
- Semantic similarity for conceptual matches
- Results ranked by relevance combining all signals`,
inputSchema: {
type: 'object',
properties: {
query: {
type: 'string',
description:
'Search query - be specific and include unique terms. Use quotes for exact phrases. Example: "Card component props headline status" or "REST API authentication Bearer token"',
},
url: {
type: 'string',
description:
'Optional: Filter results to a specific documentation site by its URL. If not provided, searches all indexed docs.',
},
limit: {
type: 'number',
description: 'Maximum number of results (default: 10)',
},
tags: {
type: 'array',
items: { type: 'string' },
description:
'Optional: Filter to docs with ALL specified tags. Use when user mentions a category, project, or team (e.g., ["frontend", "mycompany"]). See list_tags for available tags.',
},
},
required: ['query'],
},
},
{
name: 'reindex_documentation',
description: 'Re-index a specific documentation site',
inputSchema: {
type: 'object',
properties: {
url: {
type: 'string',
description: 'URL of the documentation to re-index',
},
},
required: ['url'],
},
},
{
name: 'get_indexing_status',
description: 'Get current indexing status',
inputSchema: {
type: 'object',
properties: {},
},
},
{
name: 'delete_documentation',
description:
'Delete an indexed documentation site and all its data (vectors, metadata, cached crawl data, and optionally auth session)',
inputSchema: {
type: 'object',
properties: {
url: {
type: 'string',
description: 'URL of the documentation site to delete',
},
clearAuth: {
type: 'boolean',
description: 'Also clear saved authentication session for this domain (default: false)',
},
},
required: ['url'],
},
},
{
name: 'set_tags',
description:
'Set tags for a documentation site to enable tag-based filtering in searches. Tags categorize docs by project, team, or type (e.g., "frontend", "backend", "mycompany", "jimdo"). Replaces any existing tags. Use an empty array to remove all tags.',
inputSchema: {
type: 'object',
properties: {
url: {
type: 'string',
description: 'URL of the documentation site',
},
tags: {
type: 'array',
items: { type: 'string' },
description:
'Array of tags to assign. Tags are case-insensitive and must contain only alphanumeric characters, hyphens, or underscores. Example: ["frontend", "mycompany", "react"]',
},
},
required: ['url', 'tags'],
},
},
{
name: 'list_tags',
description:
'List all available tags with usage counts. Use this to discover what tags exist when you need to filter searches but are unsure of the exact tag names. Returns tags sorted by usage count.',
inputSchema: {
type: 'object',
properties: {},
},
},
// ============ Collection Tools ============
{
name: 'create_collection',
description:
'Create a new collection to group related documentation sites. Collections help organize docs by project or context (e.g., "My React Project" with React + Next.js + TypeScript docs).',
inputSchema: {
type: 'object',
properties: {
name: {
type: 'string',
description: 'Unique name for the collection (e.g., "My React Project", "Backend APIs")',
},
description: {
type: 'string',
description: 'Optional description of what this collection contains',
},
},
required: ['name'],
},
},
{
name: 'delete_collection',
description: 'Delete a collection. The documentation sites in the collection are NOT deleted, only the collection grouping.',
inputSchema: {
type: 'object',
properties: {
name: {
type: 'string',
description: 'Name of the collection to delete',
},
},
required: ['name'],
},
},
{
name: 'update_collection',
description: "Update a collection's name or description.",
inputSchema: {
type: 'object',
properties: {
name: {
type: 'string',
description: 'Current name of the collection',
},
newName: {
type: 'string',
description: 'Optional new name for the collection',
},
description: {
type: 'string',
description: 'Optional new description for the collection',
},
},
required: ['name'],
},
},
{
name: 'list_collections',
description: 'List all collections with their document counts. Use this to see available collections for context switching.',
inputSchema: {
type: 'object',
properties: {},
},
},
{
name: 'get_collection',
description: 'Get details of a specific collection including all its documentation sites.',
inputSchema: {
type: 'object',
properties: {
name: {
type: 'string',
description: 'Name of the collection',
},
},
required: ['name'],
},
},
{
name: 'add_to_collection',
description: 'Add one or more documentation sites to a collection. Sites must already be indexed.',
inputSchema: {
type: 'object',
properties: {
name: {
type: 'string',
description: 'Name of the collection',
},
urls: {
type: 'array',
items: { type: 'string' },
description: 'URLs of indexed documentation sites to add (max 50)',
},
},
required: ['name', 'urls'],
},
},
{
name: 'remove_from_collection',
description:
'Remove one or more documentation sites from a collection. The sites remain indexed, just removed from the collection.',
inputSchema: {
type: 'object',
properties: {
name: {
type: 'string',
description: 'Name of the collection',
},
urls: {
type: 'array',
items: { type: 'string' },
description: 'URLs of documentation sites to remove from the collection',
},
},
required: ['name', 'urls'],
},
},
{
name: 'search_collection',
description:
'Search for documentation within a specific collection. This is useful for focused searches within a project context. Uses the same hybrid search (full-text + semantic) as search_documentation.',
inputSchema: {
type: 'object',
properties: {
name: {
type: 'string',
description: 'Name of the collection to search in',
},
query: {
type: 'string',
description: 'Search query - be specific and include unique terms',
},
limit: {
type: 'number',
description: 'Maximum number of results (default: 10)',
},
},
required: ['name', 'query'],
},
},
],
}));
// Handle tool calls
this.server.server.setRequestHandler(
CallToolRequestSchema,
async (request: { params: { name: string; arguments?: Record<string, unknown>; _meta?: Record<string, unknown> } }) => {
// Extract progressToken from request metadata. Per the MCP spec it lives in
// params._meta, not inside the tool arguments; clients include it to receive
// progress notifications.
const progressToken = request.params._meta?.progressToken as ProgressToken | undefined;
switch (request.params.name) {
case 'add_documentation':
return this.handleAddDocumentation(request.params.arguments, progressToken);
case 'list_documentation':
return this.handleListDocumentation();
case 'search_documentation':
return this.handleSearchDocumentation(request.params.arguments);
case 'reindex_documentation':
return this.handleReindexDocumentation(request.params.arguments, progressToken);
case 'get_indexing_status':
return this.handleGetIndexingStatus();
case 'authenticate':
return this.handleAuthenticate(request.params.arguments);
case 'clear_auth':
return this.handleClearAuth(request.params.arguments);
case 'delete_documentation':
return this.handleDeleteDocumentation(request.params.arguments);
case 'set_tags':
return this.handleSetTags(request.params.arguments);
case 'list_tags':
return this.handleListTags();
// Collection handlers
case 'create_collection':
return this.handleCreateCollection(request.params.arguments);
case 'delete_collection':
return this.handleDeleteCollection(request.params.arguments);
case 'update_collection':
return this.handleUpdateCollection(request.params.arguments);
case 'list_collections':
return this.handleListCollections();
case 'get_collection':
return this.handleGetCollection(request.params.arguments);
case 'add_to_collection':
return this.handleAddToCollection(request.params.arguments);
case 'remove_from_collection':
return this.handleRemoveFromCollection(request.params.arguments);
case 'search_collection':
return this.handleSearchCollection(request.params.arguments);
default:
throw new McpError(ErrorCode.MethodNotFound, `Unknown tool: ${request.params.name}`);
}
}
);
}
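/**
* Kick off indexing of a new documentation site and return immediately;
* progress is reported via get_indexing_status and optional MCP progress notifications.
*
* @example
* // Hypothetical tool-call arguments, matching the add_documentation input schema:
* // { "url": "https://docs.example.com", "pathPrefix": "/api/v2", "tags": ["frontend"], "version": "18" }
*/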
private async handleAddDocumentation(args: Record<string, unknown> | undefined, progressToken?: ProgressToken) {
// Validate arguments with schema
let validatedArgs;
try {
validatedArgs = validateToolArgs(args, AddDocumentationArgsSchema);
} catch (error) {
throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
}
const { url, title, id, pathPrefix, tags, version, auth: authOptions } = validatedArgs;
// Additional SSRF protection check
if (!isValidPublicUrl(url)) {
throw new McpError(ErrorCode.InvalidParams, 'Access to private networks is blocked');
}
const normalizedUrl = normalizeUrl(url);
const docTitle = title || new URL(normalizedUrl).hostname;
// Use custom ID if provided, otherwise auto-generate
const docId = id || generateDocId(normalizedUrl, docTitle);
// Log path prefix if provided
if (pathPrefix) {
logger.info(`[WebDocsServer] Path prefix restriction: ${pathPrefix}`);
}
if (authOptions?.requiresAuth) {
const hasExistingSession = await this.authManager.hasSession(normalizedUrl);
if (!hasExistingSession) {
logger.info(`[WebDocsServer] auth.requiresAuth=true, starting interactive login for ${normalizedUrl}`);
try {
await this.authManager.performInteractiveLogin(normalizedUrl, {
browser: authOptions.browser,
loginUrl: authOptions.loginUrl,
loginSuccessPattern: authOptions.loginSuccessPattern,
loginSuccessSelector: authOptions.loginSuccessSelector,
loginTimeoutSecs: authOptions.loginTimeoutSecs,
});
logger.info(`[WebDocsServer] Authentication successful for ${normalizedUrl}`);
} catch (error) {
throw new McpError(
ErrorCode.InternalError,
`Authentication failed: ${sanitizeErrorMessage(error)}. Please try using the 'authenticate' tool separately.`
);
}
} else {
// Validate that the existing session is still valid before crawling
logger.info(`[WebDocsServer] Validating existing session for ${normalizedUrl}...`);
const validation = await this.authManager.validateSession(normalizedUrl);
if (!validation.isValid) {
logger.warn(`[WebDocsServer] Session expired for ${normalizedUrl}: ${validation.reason}`);
// Clear the expired session
await this.authManager.clearSession(normalizedUrl);
throw new McpError(
ErrorCode.InvalidParams,
`Authentication session has expired (${validation.reason}). Please use the 'authenticate' tool to log in again.`
);
}
logger.info(`[WebDocsServer] ✓ Session validated for ${normalizedUrl}`);
}
}
// Prepare auth info to store with the document
// If auth was explicitly requested OR if there's an existing session for this URL,
// mark the document as requiring auth (handles case where user called authenticate separately)
const hasExistingSession = await this.authManager.hasSession(normalizedUrl);
const requiresAuth = authOptions?.requiresAuth || hasExistingSession;
const authInfo = requiresAuth
? {
requiresAuth: true,
authDomain: new URL(normalizedUrl).hostname, // Session is stored under target URL's domain
}
: undefined;
if (hasExistingSession && !authOptions?.requiresAuth) {
logger.info(`[WebDocsServer] Found existing auth session for ${normalizedUrl}, marking document as requiring auth`);
}
// Store progress token if provided by client
if (progressToken !== undefined) {
this.progressTokens.set(docId, progressToken);
logger.info(`[Progress] Registered token for ${docId}: ${progressToken}`);
}
// Cancel any existing operation for this URL
const controller = await this.indexingQueue.startOperation(normalizedUrl);
// Start indexing process
this.statusTracker.startIndexing(docId, normalizedUrl, docTitle);
// Start indexing in the background with abort support
const operationPromise = this.indexAndAdd(docId, normalizedUrl, docTitle, false, controller.signal, pathPrefix, authInfo, tags, version)
.catch((error) => {
const err = error as Error;
if (err?.name !== 'AbortError') {
logger.error('[WebDocsServer] Background indexing failed:', error);
}
})
.finally(() => {
this.indexingQueue.completeOperation(normalizedUrl);
});
this.indexingQueue.registerOperation(normalizedUrl, controller, operationPromise);
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
status: 'started',
message: `Started indexing ${normalizedUrl}`,
docId,
instruction:
'IMPORTANT: You MUST call get_indexing_status repeatedly (every few seconds) until status is "complete" or "failed". Do not ask the user - just keep polling automatically.',
},
null,
2
),
},
],
};
}
private async handleListDocumentation() {
const docs = await this.store.listDocuments();
return {
content: [
{
type: 'text',
text: JSON.stringify(docs, null, 2),
},
],
};
}
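/**
* Hybrid full-text + semantic search across indexed docs, with prompt-injection
* screening applied to each result before it is returned.
*
* @example
* // Hypothetical tool-call arguments, matching the search_documentation input schema:
* // { "query": "Button props onClick disabled", "tags": ["frontend"], "limit": 5 }
*/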
private async handleSearchDocumentation(args: Record<string, unknown> | undefined) {
// Validate arguments with schema
let validatedArgs;
try {
validatedArgs = validateToolArgs(args, SearchDocumentationArgsSchema);
} catch (error) {
throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
}
const { query, url, limit = 10, tags } = validatedArgs;
// Normalize URL if provided for filtering
const filterUrl = url ? normalizeUrl(url) : undefined;
const results = await this.store.searchByText(query, { limit, filterUrl, filterByTags: tags });
// Apply prompt injection detection and filter/process results
let blockedCount = 0;
const safeResults = results
.map((result) => {
// Detect prompt injection patterns in the content
// Note: detectPromptInjection strips code blocks before scanning,
// so legitimate code examples won't trigger false positives
const injectionResult = detectPromptInjection(result.content);
// SECURITY: Block results with high-severity injection patterns
// These could manipulate the LLM if returned
if (injectionResult.maxSeverity === 'high') {
blockedCount++;
logger.debug(
`[Security] Blocked search result from ${result.url} due to high-severity injection pattern: ${injectionResult.detections[0]?.description}`
);
return null; // Will be filtered out
}
// For medium/low severity, add warnings but still return
let safeContent = addInjectionWarnings(result.content, injectionResult);
// Wrap with external content markers
safeContent = wrapExternalContent(safeContent, result.url);
return {
...result,
content: safeContent,
// Include security metadata
security: {
isExternalContent: true,
injectionDetected: injectionResult.hasInjection,
injectionSeverity: injectionResult.maxSeverity,
detectionCount: injectionResult.detections.length,
},
};
})
.filter((result): result is NonNullable<typeof result> => result !== null);
// Build response with security notice if content was blocked
const response: { results: typeof safeResults; securityNotice?: string } = {
results: safeResults,
};
if (blockedCount > 0) {
response.securityNotice = `${blockedCount} result(s) were blocked due to high-severity prompt injection patterns detected in the content. This protects against potentially malicious content that could manipulate AI behavior.`;
}
return {
content: [
{
type: 'text',
text: JSON.stringify(response, null, 2),
},
],
};
}
private async handleReindexDocumentation(args: Record<string, unknown> | undefined, progressToken?: ProgressToken) {
// Validate arguments with schema
let validatedArgs;
try {
validatedArgs = validateToolArgs(args, ReindexDocumentationArgsSchema);
} catch (error) {
throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
}
const { url } = validatedArgs;
// Additional SSRF protection check
if (!isValidPublicUrl(url)) {
throw new McpError(ErrorCode.InvalidParams, 'Access to private networks is blocked');
}
const normalizedUrl = normalizeUrl(url);
const doc = await this.store.getDocument(normalizedUrl);
if (!doc) {
throw new McpError(ErrorCode.InvalidParams, 'Documentation not found');
}
// Check if this site was originally indexed with authentication
// If so, we MUST have a valid session to reindex
if (doc.requiresAuth) {
const authDomain = doc.authDomain || new URL(normalizedUrl).hostname;
logger.info(`[WebDocsServer] Site requires auth (authDomain: ${authDomain}). Validating session...`);
// Check if we have a session for this auth domain
const hasSession = await this.authManager.hasSession(normalizedUrl);
if (!hasSession) {
throw new McpError(
ErrorCode.InvalidParams,
`This documentation site requires authentication but no session was found. Please use the 'authenticate' tool to log in before re-indexing.`
);
}
// Validate the session is still valid
const validation = await this.authManager.validateSession(normalizedUrl);
if (!validation.isValid) {
logger.warn(`[WebDocsServer] Session expired for ${normalizedUrl}: ${validation.reason}`);
// Clear the expired session
await this.authManager.clearSession(normalizedUrl);
throw new McpError(
ErrorCode.InvalidParams,
`Authentication session has expired (${validation.reason}). Please use the 'authenticate' tool to log in again before re-indexing.`
);
}
logger.info(`[WebDocsServer] ✓ Session validated for ${normalizedUrl}`);
}
// Prepare auth info to preserve with reindexed document
const authInfo = doc.requiresAuth
? {
requiresAuth: true,
authDomain: doc.authDomain || new URL(normalizedUrl).hostname,
}
: undefined;
// Preserve existing tags and version during reindex
const existingTags = doc.tags;
const existingVersion = doc.version;
// Check for an in-flight operation; startOperation below cancels and replaces it
const wasCancelled = this.indexingQueue.isIndexing(normalizedUrl);
const controller = await this.indexingQueue.startOperation(normalizedUrl);
const docId = generateDocId(normalizedUrl, doc.title);
// Store progress token if provided by client
if (progressToken !== undefined) {
this.progressTokens.set(docId, progressToken);
logger.info(`[Progress] Registered token for ${docId}: ${progressToken}`);
}
this.statusTracker.startIndexing(docId, normalizedUrl, doc.title);
// Start reindexing in the background with abort support (preserving existing tags and version)
const operationPromise = this.indexAndAdd(
docId,
normalizedUrl,
doc.title,
true,
controller.signal,
undefined,
authInfo,
existingTags,
existingVersion
)
.catch((error) => {
const err = error as Error;
if (err?.name !== 'AbortError') {
logger.error('[WebDocsServer] Background reindexing failed:', error);
}
})
.finally(() => {
this.indexingQueue.completeOperation(normalizedUrl);
});
this.indexingQueue.registerOperation(normalizedUrl, controller, operationPromise);
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
status: 'started',
message: wasCancelled
? `Started re-indexing ${normalizedUrl}. Previous operation was cancelled.`
: `Started re-indexing ${normalizedUrl}`,
docId,
instruction:
'IMPORTANT: You MUST call get_indexing_status repeatedly (every few seconds) until status is "complete" or "failed". Do not ask the user - just keep polling automatically.',
},
null,
2
),
},
],
};
}
private handleGetIndexingStatus() {
// Get only active operations and recently completed ones (auto-cleans old statuses)
const statuses = this.statusTracker.getActiveStatuses();
// Check if any operations are still in progress
const hasActiveOperations = statuses.some((s) => s.status === 'indexing');
// Add instruction for agent
const response = {
statuses,
instruction: hasActiveOperations
? 'Operations still in progress. Call get_indexing_status again in a few seconds to check progress.'
: 'All operations complete. No need to poll again.',
};
return {
content: [
{
type: 'text',
text: JSON.stringify(response, null, 2),
},
],
};
}
/**
* Handle an interactive authentication request.
* Opens a visible browser so the user can log in manually.
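*
* @example
* // Hypothetical tool-call arguments, matching the authenticate input schema:
* // { "url": "https://docs.example.com", "loginTimeoutSecs": 300 }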
*/
private async handleAuthenticate(args: Record<string, unknown> | undefined) {
// Validate arguments with schema
let validatedArgs;
try {
validatedArgs = validateToolArgs(args, AuthenticateArgsSchema);
} catch (error) {
throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
}
const { url, browser, loginUrl, loginTimeoutSecs = 300 } = validatedArgs;
// Additional SSRF protection check
if (!isValidPublicUrl(url)) {
throw new McpError(ErrorCode.InvalidParams, 'Access to private networks is blocked');
}
const normalizedUrl = normalizeUrl(url);
const domain = new URL(normalizedUrl).hostname;
// Check if we already have a session and validate it
const hasSession = await this.authManager.hasSession(normalizedUrl);
if (hasSession) {
// Validate that the existing session is still valid
logger.info(`[Auth] Validating existing session for ${domain}...`);
const validation = await this.authManager.validateSession(normalizedUrl);
if (validation.isValid) {
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
status: 'existing_session',
message: `Already have a valid saved session for ${domain}. Use clear_auth first if you need to re-authenticate.`,
domain,
sessionValid: true,
},
null,
2
),
},
],
};
}
// Session is expired - clear it and proceed with new login
logger.info(`[Auth] Existing session for ${domain} has expired (${validation.reason}). Proceeding with new login.`);
await this.authManager.clearSession(normalizedUrl);
}
try {
logger.info(`[Auth] Opening ${browser || 'auto-detected'} browser for authentication to ${domain}`);
// Perform interactive login
await this.authManager.performInteractiveLogin(normalizedUrl, {
browser,
loginUrl,
loginTimeoutSecs,
});
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
status: 'success',
message: `Successfully authenticated to ${domain}. Session saved for future crawls.`,
domain,
instruction: 'You can now use add_documentation to crawl this site. The saved session will be used automatically.',
},
null,
2
),
},
],
};
} catch (error) {
const safeErrorMessage = sanitizeErrorMessage(error);
logger.error(`[Auth] Authentication failed:`, safeErrorMessage);
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
status: 'failed',
message: `Authentication failed: ${safeErrorMessage}`,
domain,
},
null,
2
),
},
],
};
}
}
/**
* Handle clearing saved authentication for a domain
*/
private async handleClearAuth(args: Record<string, unknown> | undefined) {
// Validate arguments with schema
let validatedArgs;
try {
validatedArgs = validateToolArgs(args, ClearAuthArgsSchema);
} catch (error) {
throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
}
const { url } = validatedArgs;
const normalizedUrl = normalizeUrl(url);
const domain = new URL(normalizedUrl).hostname;
await this.authManager.clearSession(normalizedUrl);
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
status: 'success',
message: `Cleared saved authentication for ${domain}`,
domain,
},
null,
2
),
},
],
};
}
/**
* Handle deleting an indexed documentation site and all its data
*/
private async handleDeleteDocumentation(args: Record<string, unknown> | undefined) {
// Validate arguments with schema
let validatedArgs;
try {
validatedArgs = validateToolArgs(args, DeleteDocumentationArgsSchema);
} catch (error) {
throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
}
const { url, clearAuth = false } = validatedArgs;
const normalizedUrl = normalizeUrl(url);
const domain = new URL(normalizedUrl).hostname;
// Check if document exists
const doc = await this.store.getDocument(normalizedUrl);
if (!doc) {
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
status: 'not_found',
message: `No indexed documentation found for ${normalizedUrl}`,
url: normalizedUrl,
},
null,
2
),
},
],
};
}
const deletedItems: string[] = [];
try {
// 1. Delete from SQLite and LanceDB (via store)
await this.store.deleteDocument(normalizedUrl);
deletedItems.push('document metadata (SQLite)', 'vector chunks (LanceDB)');
logger.info(`[WebDocsServer] Deleted document from store: ${normalizedUrl}`);
// 2. Delete Crawlee dataset
const docId = generateDocId(normalizedUrl, doc.title);
try {
const { Dataset } = await import('crawlee');
const dataset = await Dataset.open(docId);
await dataset.drop();
deletedItems.push('crawl cache (Crawlee dataset)');
logger.info(`[WebDocsServer] Deleted Crawlee dataset: ${docId}`);
} catch {
logger.debug(`[WebDocsServer] No Crawlee dataset to delete for ${docId}`);
}
// 3. Optionally clear auth session
if (clearAuth) {
await this.authManager.clearSession(normalizedUrl);
deletedItems.push('authentication session');
logger.info(`[WebDocsServer] Cleared auth session for ${domain}`);
}
// Optimize storage after deletion to reclaim space
// This runs in the background and doesn't block the response
this.store.optimize().catch((err) => {
logger.warn('[WebDocsServer] Background optimization after delete failed:', err);
});
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
status: 'success',
message: `Successfully deleted documentation for ${normalizedUrl}`,
url: normalizedUrl,
title: doc.title,
deletedItems,
},
null,
2
),
},
],
};
} catch (error) {
const safeErrorMessage = sanitizeErrorMessage(error);
logger.error(`[WebDocsServer] Error deleting documentation:`, safeErrorMessage);
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
status: 'error',
message: `Failed to delete documentation: ${safeErrorMessage}`,
url: normalizedUrl,
deletedItems,
},
null,
2
),
},
],
};
}
}
/**
* Handle setting tags for a documentation site
*/
private async handleSetTags(args: Record<string, unknown> | undefined) {
// Validate arguments with schema
let validatedArgs;
try {
validatedArgs = validateToolArgs(args, SetTagsArgsSchema);
} catch (error) {
throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
}
const { url, tags } = validatedArgs;
const normalizedUrl = normalizeUrl(url);
try {
await this.store.setTags(normalizedUrl, tags);
// Get the updated document to return current state
const doc = await this.store.getDocument(normalizedUrl);
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
status: 'success',
message: `Successfully updated tags for ${normalizedUrl}`,
url: normalizedUrl,
title: doc?.title,
tags: doc?.tags || [],
},
null,
2
),
},
],
};
} catch (error) {
const safeErrorMessage = sanitizeErrorMessage(error);
// Check for "Documentation not found" error
if (safeErrorMessage.includes('Documentation not found')) {
throw new McpError(ErrorCode.InvalidParams, `Documentation not found for URL: ${normalizedUrl}`);
}
throw new McpError(ErrorCode.InternalError, `Failed to set tags: ${safeErrorMessage}`);
}
}
/**
* Handle listing all tags with usage counts
*/
private async handleListTags() {
const tags = await this.store.listAllTags();
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
tags,
total: tags.length,
},
null,
2
),
},
],
};
}
// ============ Collection Handlers ============
/**
* Handle creating a new collection
*/
private async handleCreateCollection(args: Record<string, unknown> | undefined) {
let validatedArgs;
try {
validatedArgs = validateToolArgs(args, CreateCollectionArgsSchema);
} catch (error) {
throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
}
const { name, description } = validatedArgs;
try {
await this.store.createCollection(name, description);
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
status: 'success',
message: `Collection "${name}" created successfully`,
collection: {
name,
description,
},
},
null,
2
),
},
],
};
} catch (error) {
const safeMessage = sanitizeErrorMessage(error);
if (safeMessage.includes('already exists')) {
throw new McpError(ErrorCode.InvalidParams, safeMessage);
}
throw new McpError(ErrorCode.InternalError, `Failed to create collection: ${safeMessage}`);
}
}
/**
* Handle deleting a collection
*/
private async handleDeleteCollection(args: Record<string, unknown> | undefined) {
let validatedArgs;
try {
validatedArgs = validateToolArgs(args, DeleteCollectionArgsSchema);
} catch (error) {
throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
}
const { name } = validatedArgs;
try {
await this.store.deleteCollection(name);
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
status: 'success',
message: `Collection "${name}" deleted. Documentation sites remain indexed.`,
},
null,
2
),
},
],
};
} catch (error) {
const safeMessage = sanitizeErrorMessage(error);
if (safeMessage.includes('not found')) {
throw new McpError(ErrorCode.InvalidParams, safeMessage);
}
throw new McpError(ErrorCode.InternalError, `Failed to delete collection: ${safeMessage}`);
}
}
/**
* Handle updating a collection's metadata
*/
private async handleUpdateCollection(args: Record<string, unknown> | undefined) {
let validatedArgs;
try {
validatedArgs = validateToolArgs(args, UpdateCollectionArgsSchema);
} catch (error) {
throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
}
const { name, newName, description } = validatedArgs;
// Must provide at least one field to update
if (newName === undefined && description === undefined) {
throw new McpError(ErrorCode.InvalidParams, 'Must provide newName or description to update');
}
try {
await this.store.updateCollection(name, { newName, description });
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
status: 'success',
message: `Collection updated successfully`,
collection: {
name: newName ?? name,
description,
},
},
null,
2
),
},
],
};
} catch (error) {
const safeMessage = sanitizeErrorMessage(error);
if (safeMessage.includes('not found') || safeMessage.includes('already exists')) {
throw new McpError(ErrorCode.InvalidParams, safeMessage);
}
throw new McpError(ErrorCode.InternalError, `Failed to update collection: ${safeMessage}`);
}
}
/**
* Handle listing all collections
*/
private async handleListCollections() {
const collections = await this.store.listCollections();
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
collections,
total: collections.length,
},
null,
2
),
},
],
};
}
/**
* Handle getting a specific collection with its documents
*/
private async handleGetCollection(args: Record<string, unknown> | undefined) {
let validatedArgs;
try {
validatedArgs = validateToolArgs(args, GetCollectionArgsSchema);
} catch (error) {
throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
}
const { name } = validatedArgs;
const collection = await this.store.getCollection(name);
if (!collection) {
throw new McpError(ErrorCode.InvalidParams, `Collection "${name}" not found`);
}
return {
content: [
{
type: 'text',
text: JSON.stringify(collection, null, 2),
},
],
};
}
/**
* Handle adding documents to a collection
*/
private async handleAddToCollection(args: Record<string, unknown> | undefined) {
let validatedArgs;
try {
validatedArgs = validateToolArgs(args, AddToCollectionArgsSchema);
} catch (error) {
throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
}
const { name, urls } = validatedArgs;
// Normalize URLs
const normalizedUrls = urls.map((url) => normalizeUrl(url));
try {
const result = await this.store.addToCollection(name, normalizedUrls);
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
status: 'success',
message: `Added ${result.added.length} document(s) to collection "${name}"`,
...result,
},
null,
2
),
},
],
};
} catch (error) {
const safeMessage = sanitizeErrorMessage(error);
if (safeMessage.includes('not found')) {
throw new McpError(ErrorCode.InvalidParams, safeMessage);
}
throw new McpError(ErrorCode.InternalError, `Failed to add to collection: ${safeMessage}`);
}
}
/**
* Handle removing documents from a collection
*/
private async handleRemoveFromCollection(args: Record<string, unknown> | undefined) {
let validatedArgs;
try {
validatedArgs = validateToolArgs(args, RemoveFromCollectionArgsSchema);
} catch (error) {
throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
}
const { name, urls } = validatedArgs;
// Normalize URLs
const normalizedUrls = urls.map((url) => normalizeUrl(url));
try {
const result = await this.store.removeFromCollection(name, normalizedUrls);
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
status: 'success',
message: `Removed ${result.removed.length} document(s) from collection "${name}"`,
...result,
},
null,
2
),
},
],
};
} catch (error) {
const safeMessage = sanitizeErrorMessage(error);
if (safeMessage.includes('not found')) {
throw new McpError(ErrorCode.InvalidParams, safeMessage);
}
throw new McpError(ErrorCode.InternalError, `Failed to remove from collection: ${safeMessage}`);
}
}
/**
* Handle searching within a collection
*/
private async handleSearchCollection(args: Record<string, unknown> | undefined) {
let validatedArgs;
try {
validatedArgs = validateToolArgs(args, SearchCollectionArgsSchema);
} catch (error) {
throw new McpError(ErrorCode.InvalidParams, sanitizeErrorMessage(error));
}
const { name, query, limit = 10 } = validatedArgs;
// Get URLs in the collection
const collectionUrls = await this.store.getCollectionUrls(name);
if (collectionUrls.length === 0) {
// Check if collection exists but is empty
const collection = await this.store.getCollection(name);
if (!collection) {
throw new McpError(ErrorCode.InvalidParams, `Collection "${name}" not found`);
}
return {
content: [
{
type: 'text',
text: JSON.stringify(
{
results: [],
message: `Collection "${name}" is empty. Add documentation sites to search.`,
},
null,
2
),
},
],
};
}
// Search with the existing searchByText, then filter down to collection URLs.
// We oversample (limit * 3) because a native collection filter isn't available;
// note this can still return fewer than `limit` hits if collection pages rank low.
const allResults = await this.store.searchByText(query, { limit: limit * 3 });
// Filter to only include results from collection URLs
const collectionUrlSet = new Set(collectionUrls);
let filteredResults = allResults.filter((result) => collectionUrlSet.has(result.url));
// Apply limit
filteredResults = filteredResults.slice(0, limit);
// Apply prompt injection detection and filter/process results (same as handleSearchDocumentation)
let blockedCount = 0;
const safeResults = filteredResults
.map((result) => {
const injectionResult = detectPromptInjection(result.content);
if (injectionResult.maxSeverity === 'high') {
blockedCount++;
logger.debug(
`[Security] Blocked search result from ${result.url} due to high-severity injection pattern: ${injectionResult.detections[0]?.description}`
);
return null;
}
let safeContent = addInjectionWarnings(result.content, injectionResult);
safeContent = wrapExternalContent(safeContent, result.url);
return {
...result,
content: safeContent,
security: {
isExternalContent: true,
injectionDetected: injectionResult.hasInjection,
injectionSeverity: injectionResult.maxSeverity,
detectionCount: injectionResult.detections.length,
},
};
})
.filter((result): result is NonNullable<typeof result> => result !== null);
const response: { results: typeof safeResults; collection: string; securityNotice?: string } = {
results: safeResults,
collection: name,
};
if (blockedCount > 0) {
response.securityNotice = `${blockedCount} result(s) were blocked due to high-severity prompt injection patterns.`;
}
return {
content: [
{
type: 'text',
text: JSON.stringify(response, null, 2),
},
],
};
}
private async indexAndAdd(
id: string,
url: string,
title: string,
reIndex: boolean = false,
signal?: AbortSignal,
pathPrefix?: string,
authInfo?: { requiresAuth: boolean; authDomain: string },
tags?: string[],
version?: string
) {
// Helper to check if operation was cancelled
const checkCancelled = () => {
if (signal?.aborted) {
logger.info(`[WebDocsServer] Operation cancelled for ${url}`);
this.statusTracker.cancelIndexing(id);
const error = new Error('Operation cancelled');
error.name = 'AbortError';
throw error;
}
};
try {
logger.info(`[WebDocsServer] Starting indexAndAdd for ${url} (reIndex: ${reIndex})`);
checkCancelled();
// Check if document exists
logger.debug(`[WebDocsServer] Checking if document exists: ${url}`);
const existingDoc = await this.store.getDocument(url);
if (existingDoc) {
logger.debug(`[WebDocsServer] Document exists: ${url}`);
if (!reIndex) {
logger.info(`[WebDocsServer] Document ${url} already indexed and reIndex=false`);
this.statusTracker.completeIndexing(id);
return;
}
logger.info(`[WebDocsServer] Will reindex existing document: ${url}`);
} else {
logger.debug(`[WebDocsServer] Document does not exist: ${url}`);
}
checkCancelled();
// Start crawling
logger.info(
`[WebDocsServer] Starting crawl with depth=${this.config.maxDepth}, maxRequests=${this.config.maxRequestsPerCrawl}${pathPrefix ? `, pathPrefix=${pathPrefix}` : ''}`
);
this.statusTracker.updateProgress(id, 0, 'Finding subpages');
const crawler = new DocsCrawler(this.config.maxDepth, this.config.maxRequestsPerCrawl, this.config.githubToken);
// Set path prefix restriction if provided
if (pathPrefix) {
crawler.setPathPrefix(pathPrefix);
}
// Load saved authentication session if available
const savedSession = await this.authManager.loadSession(url);
if (savedSession) {
try {
// Validate the session structure before using it
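// (The expected shape presumably mirrors Playwright's persisted storage state,
// roughly { cookies: [...], origins: [{ origin, localStorage: [...] }] };
// the canonical schema lives in ./util/security.js)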
const validatedState: ValidatedStorageState = safeJsonParse(savedSession, StorageStateSchema);
// The validated state is structurally compatible with StorageState
crawler.setStorageState(validatedState as StorageState);
logger.info(`[WebDocsServer] Using validated authentication session for ${url}`);
} catch (e) {
logger.warn(`[WebDocsServer] Failed to parse or validate saved session:`, e);
// Continue without authentication rather than failing
}
}
const pages = [];
let processedPages = 0;
let estimatedProgress = 0;
logger.info(`[WebDocsServer] Starting page crawl for ${url}`);
for await (const page of crawler.crawl(url)) {
// Check for cancellation during crawl
if (signal?.aborted) {
logger.info(`[WebDocsServer] Crawl cancelled for ${url}`);
crawler.abort();
this.statusTracker.cancelIndexing(id);
const error = new Error('Operation cancelled');
error.name = 'AbortError';
throw error;
}
logger.debug(`[WebDocsServer] Found page ${processedPages + 1}: ${page.path}`);
processedPages++;
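// Heuristic discovery-phase progress: sum(1/2^n) converges to 1, so the geometric term
// contributes at most 0.15, and the linear term ramps up to 0.35 at 500 pages — together
// the "Finding subpages" phase stays within the first half of the progress bar.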
estimatedProgress += 1 / 2 ** processedPages;
this.statusTracker.updateProgress(
id,
0.15 * estimatedProgress + Math.min(0.35, (0.35 * processedPages) / 500),
`Finding subpages (${page.path})`
);
this.statusTracker.updateStats(id, { pagesFound: processedPages });
pages.push(page);
// Small delay to allow other operations
await new Promise((resolve) => setTimeout(resolve, 50));
}
if (pages.length === 0) {
logger.warn('[WebDocsServer] No pages found during crawl');
throw new Error('No pages found to index');
}
logger.info(`[WebDocsServer] Found ${pages.length} pages to process`);
logger.info('[WebDocsServer] Starting content processing and embedding generation');
this.statusTracker.updateStats(id, { pagesFound: pages.length });
checkCancelled();
// Process pages and create embeddings
const chunks: DocumentChunk[] = [];
const embeddings: number[][] = [];
for (let i = 0; i < pages.length; i++) {
checkCancelled();
const page = pages[i];
logger.debug(`[WebDocsServer] Processing page ${i + 1}/${pages.length}: ${page.path}`);
this.statusTracker.updateProgress(id, 0.5 + 0.3 * (i / pages.length), `Creating embeddings (${i + 1}/${pages.length})`);
try {
const processed = await this.processor.process(page);
logger.debug(`[WebDocsServer] Created ${processed.chunks.length} chunks for ${page.path}`);
chunks.push(...processed.chunks);
embeddings.push(...processed.chunks.map((chunk) => chunk.vector));
this.statusTracker.updateStats(id, {
pagesProcessed: i + 1,
chunksCreated: chunks.length,
});
} catch (error) {
logger.error(`[WebDocsServer] Error processing page ${page.path}:`, error);
}
// Small delay
await new Promise((resolve) => setTimeout(resolve, 20));
}
logger.info(`[WebDocsServer] Total chunks created: ${chunks.length}`);
// Scan for potential prompt injection patterns in indexed content
// Note: Detection is informational only. Logs are at DEBUG level to reduce noise
// from legitimate AI documentation (which contains prompt examples).
let injectionWarnings = 0;
for (const chunk of chunks) {
const injectionResult = detectPromptInjection(chunk.content);
if (injectionResult.hasInjection) {
injectionWarnings++;
if (injectionResult.maxSeverity === 'high') {
logger.debug(
`[Security] Prompt injection pattern detected in ${chunk.path || 'unknown'}: ${injectionResult.detections[0]?.description}`
);
}
}
}
if (injectionWarnings > 0) {
logger.debug(
`[Security] Detected ${injectionWarnings} chunks with potential prompt injection patterns in ${url}. Content will be marked when returned in search results.`
);
}
if (embeddings.length === 0) {
logger.warn(`[WebDocsServer] No content was extracted from ${url}`);
logger.warn(`[WebDocsServer] Pages found: ${pages.length}`);
logger.warn(`[WebDocsServer] Chunks created: ${chunks.length}`);
this.statusTracker.failIndexing(id, 'No content was extracted from the pages');
return;
}
checkCancelled();
// Delete old data if reindexing
if (reIndex && existingDoc) {
this.statusTracker.updateProgress(id, 0.8, 'Deleting old data');
await this.store.deleteDocument(url);
}
checkCancelled();
// Get favicon
const favicon = await fetchFavicon(new URL(url));
// Store the data with retry logic
this.statusTracker.updateProgress(id, 0.9, `Storing ${embeddings.length} chunks`);
await this.addDocumentWithRetry({
metadata: {
url,
title,
favicon: favicon ?? undefined,
lastIndexed: new Date(),
requiresAuth: authInfo?.requiresAuth,
authDomain: authInfo?.authDomain,
version,
},
chunks: chunks.map((chunk, i) => ({
...chunk,
vector: embeddings[i],
})),
});
// Always update tags when indexing (clears old tags if none provided)
await this.store.setTags(url, tags || []);
if (tags && tags.length > 0) {
logger.info(`[WebDocsServer] Tags set for ${url}:`, tags);
} else {
logger.debug(`[WebDocsServer] Tags cleared for ${url}`);
}
logger.info(`[WebDocsServer] Successfully indexed ${url}`);
logger.info(`[WebDocsServer] Pages processed: ${pages.length}`);
logger.info(`[WebDocsServer] Chunks stored: ${chunks.length}`);
this.statusTracker.updateStats(id, { chunksCreated: chunks.length });
this.statusTracker.completeIndexing(id);
// Optimize storage after indexing to compact data and clean up old versions
// This runs in the background and doesn't block the response
this.store.optimize().catch((err) => {
logger.warn('[WebDocsServer] Background optimization failed:', err);
});
} catch (error) {
// Don't log AbortError as a real error
if (error instanceof Error && error.name === 'AbortError') {
logger.info(`[WebDocsServer] Indexing cancelled for ${url}`);
return;
}
// Handle expired session errors specially
if (error instanceof SessionExpiredError) {
logger.warn(`[WebDocsServer] Session expired during crawl of ${url}: ${error.message}`);
logger.warn(`[WebDocsServer] Expected URL: ${error.expectedUrl}, Detected URL: ${error.detectedUrl}`);
// Clear the expired session
await this.authManager.clearSession(url);
logger.info(`[WebDocsServer] Cleared expired session for ${url}`);
// Report user-friendly error
const userMessage = `Authentication session has expired. The crawler was redirected to a login page. Please use the 'authenticate' tool to log in again before re-indexing.`;
this.statusTracker.failIndexing(id, userMessage);
return;
}
logger.error('[WebDocsServer] Error during indexing:', error);
logger.error('[WebDocsServer] Error details:', error instanceof Error ? error.stack : error);
this.statusTracker.failIndexing(id, error instanceof Error ? error.message : 'Unknown error');
}
}
/**
* Add document with retry logic for transient database conflicts
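*
* @example
* // Retry timing sketch with the linear backoff below: attempt 1 fails → wait 1s,
* // attempt 2 fails → wait 2s, attempt 3 fails → the error propagates to the caller.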
*/
private async addDocumentWithRetry(
doc: {
metadata: {
url: string;
title: string;
favicon?: string;
lastIndexed: Date;
requiresAuth?: boolean;
authDomain?: string;
version?: string;
};
chunks: DocumentChunk[];
},
maxRetries = 3
): Promise<void> {
for (let attempt = 1; attempt <= maxRetries; attempt++) {
try {
await this.store.addDocument(doc);
return;
} catch (error) {
const isConflict = error instanceof Error && error.message?.includes('Commit conflict');
if (isConflict && attempt < maxRetries) {
logger.warn(`[WebDocsServer] Database conflict, retrying (${attempt}/${maxRetries})...`);
await new Promise((resolve) => setTimeout(resolve, 1000 * attempt)); // Linear backoff: 1s, then 2s
continue;
}
throw error;
}
}
}
async run() {
// Initialize components
await this.initialize();
// Connect to stdio transport
const transport = new StdioServerTransport();
await this.server.connect(transport);
logger.info('Web Docs MCP server running on stdio');
}
}
// Start server
const server = new WebDocsServer();
server.run().catch((err) => logger.error('Server failed to start:', err));
// Handle process signals - exit cleanly on shutdown requests
// (in-flight indexing operations are not awaited before exit)
process.on('SIGINT', () => {
logger.info('Received SIGINT, shutting down...');
process.exit(0);
});
process.on('SIGTERM', () => {
logger.info('Received SIGTERM, shutting down...');
process.exit(0);
});