Docs Fetch MCP Server

MIT License

Overview InspectNew Endpoints Schema Related Servers Reviews Score

docs-fetch-mcp
src

server.ts•15.5 kB

import { Server } from '@modelcontextprotocol/sdk/server/index.js'; import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'; import { CallToolRequestSchema, ErrorCode, ListToolsRequestSchema, McpError, } from '@modelcontextprotocol/sdk/types.js'; import puppeteer from 'puppeteer'; import axios from 'axios'; /** * Simplified MCP server for fetching web content. * * This server provides a streamlined implementation that focuses on reliability * and avoids timeout issues. It fetches web page content and can explore linked * pages up to a specified depth. */ export class DocsFetchServer { private server: Server; private visitedUrls: Set<string> = new Set(); private globalTimeout: NodeJS.Timeout | null = null; private timeoutDuration = 45000; // 45 seconds - below MCP's 60s timeout constructor() { this.server = new Server( { name: 'docs-fetch-server', version: '1.0.0', }, { capabilities: { tools: {}, }, } ); this.setupTools(); this.setupErrorHandling(); } private setupErrorHandling(): void { this.server.onerror = (error) => console.error('[MCP Error]', error); process.on('SIGINT', async () => { if (this.globalTimeout) { clearTimeout(this.globalTimeout); } await this.server.close(); process.exit(0); }); } private setupTools(): void { this.server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: [ { name: 'fetch_doc_content', description: 'Fetch web page content with the ability to explore linked pages up to a specified depth', inputSchema: { type: 'object', properties: { url: { type: 'string', description: 'URL of the web page to fetch', }, depth: { type: 'number', description: 'Maximum depth of directory/link exploration (default: 1)', minimum: 1, maximum: 5, }, }, required: ['url'], }, }, ], })); this.server.setRequestHandler(CallToolRequestSchema, async (request) => { if (request.params.name === 'fetch_doc_content') { const { url, depth = 1 } = request.params.arguments as { url: string; depth?: number; }; // Reset visited URLs for each new request this.visitedUrls.clear(); // Setup global timeout let timeoutError: Error | null = null; const timeoutPromise = new Promise<never>((_, reject) => { this.globalTimeout = setTimeout(() => { timeoutError = new Error('Operation timed out'); reject(timeoutError); }, this.timeoutDuration); }); try { // Race the content fetch against our timeout const result = await Promise.race([ this.fetchContent(url, depth), timeoutPromise ]); // Clear timeout if we didn't hit it if (this.globalTimeout) { clearTimeout(this.globalTimeout); this.globalTimeout = null; } return { content: [ { type: 'text', text: JSON.stringify(result, null, 2), }, ], }; } catch (error: unknown) { // Clear timeout if we hit an error if (this.globalTimeout) { clearTimeout(this.globalTimeout); this.globalTimeout = null; } let errorMessage = "Unknown error occurred"; if (error === timeoutError) { errorMessage = "Operation timed out before completion"; } else if (error instanceof Error) { errorMessage = error.message; } else if (typeof error === 'string') { errorMessage = error; } return { content: [ { type: 'text', text: `Error fetching content: ${errorMessage}`, }, ], isError: true, }; } } throw new McpError( ErrorCode.MethodNotFound, `Unknown tool: ${request.params.name}` ); }); } /** * Main content fetching function with simplified implementation */ private async fetchContent(url: string, maxDepth: number): Promise<any> { // Track visited URLs to avoid cycles this.visitedUrls.add(url); try { // First try a simple fetch with axios as a faster alternative try { const response = await axios.get(url, { timeout: 10000, headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' } }); // If we got HTML content, use it if (response.status === 200 && response.data && typeof response.data === 'string') { const mainContent = this.extractTextContent(response.data); const links = this.extractLinks(response.data, url); // For depth=1, just return the main content if (maxDepth <= 1) { return { rootUrl: url, explorationDepth: maxDepth, pagesExplored: 1, content: [{ url, content: mainContent, links: links.slice(0, 10) // Limit to top 10 links }] }; } // For depth > 1, explore child links const childResults = []; const limit = Math.min(5, links.length); // Limit to 5 links for performance for (let i = 0; i < limit; i++) { const link = links[i]; if (!this.visitedUrls.has(link.url)) { try { const childContent = await this.fetchSimpleContent(link.url); if (childContent) { childResults.push({ url: link.url, content: childContent, links: [] // Don't include links for child pages }); this.visitedUrls.add(link.url); } } catch (e) { // Ignore errors on child pages } } } return { rootUrl: url, explorationDepth: maxDepth, pagesExplored: 1 + childResults.length, content: [{ url, content: mainContent, links: links.slice(0, 10) }, ...childResults] }; } } catch (e) { // Fall back to puppeteer if axios fails console.error('Axios fetch failed, falling back to puppeteer:', e); } // Fall back to puppeteer for more complex pages return await this.fetchWithPuppeteer(url, maxDepth); } catch (error) { console.error('Error in content fetch:', error); // Return partial results if we have visited any URLs if (this.visitedUrls.size > 0) { return { rootUrl: url, explorationDepth: maxDepth, pagesExplored: this.visitedUrls.size, content: [], error: error instanceof Error ? error.message : String(error) }; } throw error; } } /** * Simple content fetching for child pages */ private async fetchSimpleContent(url: string): Promise<string | null> { try { const response = await axios.get(url, { timeout: 5000, headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' } }); if (response.status === 200 && response.data && typeof response.data === 'string') { return this.extractTextContent(response.data); } return null; } catch (e) { return null; } } /** * Extract plain text content from HTML */ private extractTextContent(html: string): string { // Very basic HTML to text conversion let text = html .replace(/<head>[\s\S]*?<\/head>/i, '') .replace(/<script[\s\S]*?<\/script>/gi, '') .replace(/<style[\s\S]*?<\/style>/gi, '') .replace(/<[^>]*>/g, ' ') .replace(/\s+/g, ' ') .trim(); // Truncate if too long if (text.length > 10000) { text = text.substring(0, 10000) + '... (content truncated)'; } return text; } /** * Extract links from HTML */ private extractLinks(html: string, baseUrl: string): Array<{url: string, text: string}> { const links: Array<{url: string, text: string}> = []; const linkRegex = /<a\s+(?:[^>]*?\s+)?href="([^"]*)"(?:\s+[^>]*?)?>([^<]*)<\/a>/gi; let match; while ((match = linkRegex.exec(html)) !== null) { const href = match[1].trim(); const text = match[2].trim(); // Skip empty links or special protocols if (!href || href.startsWith('#') || href.startsWith('javascript:') || href.startsWith('mailto:') || href.startsWith('tel:')) { continue; } try { // Resolve relative URLs const fullUrl = new URL(href, baseUrl).href; // Only include links from same domain if (new URL(fullUrl).hostname === new URL(baseUrl).hostname) { links.push({ url: fullUrl, text: text || fullUrl }); } } catch (e) { // Skip invalid URLs } } return links; } /** * Fetch content using puppeteer for more complex pages */ private async fetchWithPuppeteer(url: string, maxDepth: number): Promise<any> { let browser = null; try { // Launch browser with minimal options browser = await puppeteer.launch({ headless: 'new', args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'] }); const page = await browser.newPage(); await page.setDefaultNavigationTimeout(15000); await page.setRequestInterception(true); // Block unnecessary resources page.on('request', (request) => { const resourceType = request.resourceType(); if (['image', 'stylesheet', 'font', 'media'].includes(resourceType)) { request.abort(); } else { request.continue(); } }); // Navigate to the page await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 15000 }); // Wait briefly for any scripts to run await page.waitForTimeout(1000); // Extract content const content = await page.evaluate(() => { // Try to find main content const selectors = [ '.markdown-body', '.readme', '.documentation', '[role="main"]', 'main', 'article', '.content', '#content', '.main-content', '#main-content', '.docs-content', '.docs-body', '.docs-markdown', 'body' ]; let mainElement = null; for (const selector of selectors) { const elements = document.querySelectorAll(selector); if (elements.length > 0) { mainElement = elements[0]; break; } } // Fallback to body mainElement = mainElement || document.body; // Get text content const mainContent = mainElement ? mainElement.textContent || '' : ''; // Extract links const links = Array.from(document.querySelectorAll('a[href]')) .map(link => { const href = link.getAttribute('href'); const text = link.textContent || ''; if (!href || href.startsWith('#') || href.startsWith('javascript:') || href.startsWith('mailto:') || href.startsWith('tel:')) { return null; } return { url: href, text: text.trim() || href }; }) .filter(Boolean); return { mainContent: mainContent.trim(), links }; }); // Clean content const mainContent = content.mainContent .replace(/\s+/g, ' ') .trim(); // Truncate if too long const truncatedContent = mainContent.length > 10000 ? mainContent.substring(0, 10000) + '... (content truncated)' : mainContent; // Resolve and filter links const links = content.links .map((link: any) => { try { const fullUrl = new URL(link.url, url).href; return { url: fullUrl, text: link.text }; } catch (e) { return null; } }) .filter(Boolean) .filter((link: any) => { try { return new URL(link.url).hostname === new URL(url).hostname; } catch (e) { return false; } }); // For depth=1, just return the main content if (maxDepth <= 1 || links.length === 0) { return { rootUrl: url, explorationDepth: maxDepth, pagesExplored: 1, content: [{ url, content: truncatedContent, links: links.slice(0, 10) // Limit to top 10 links }] }; } // For depth > 1, explore a few child links const childResults = []; const limit = Math.min(3, links.length); // Strict limit to avoid timeouts for (let i = 0; i < limit; i++) { const link = links[i]; if (link && !this.visitedUrls.has(link.url)) { try { const childContent = await this.fetchSimpleContent(link.url); if (childContent) { childResults.push({ url: link.url, content: childContent, links: [] // Don't include links for child pages }); this.visitedUrls.add(link.url); } } catch (e) { // Ignore errors on child pages } } } return { rootUrl: url, explorationDepth: maxDepth, pagesExplored: 1 + childResults.length, content: [{ url, content: truncatedContent, links: links.slice(0, 10) }, ...childResults] }; } catch (error) { console.error('Puppeteer fetch error:', error); throw error; } finally { if (browser) { await browser.close(); } } } /** * Start the MCP server */ async run(): Promise<void> { try { const transport = new StdioServerTransport(); console.error('Connecting to transport...'); await this.server.connect(transport); console.error('Documentation fetch MCP server running on stdio'); } catch (error) { console.error('Error starting server:', error); if (error instanceof Error) { console.error('Error stack:', error.stack); } throw error; } } }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/wolfyy970/docs-fetch-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server