# read_website
Extract and convert web content to clean Markdown for efficient analysis, documentation reading, and information gathering. Preserves links and structure for clarity and usability.
## Instructions
Fast, token-efficient web content extraction - ideal for reading documentation, analyzing content, and gathering information from websites. Converts to clean Markdown while preserving links and structure.
## Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | HTTP/HTTPS URL to fetch and convert to Markdown | |
| pages | No | Maximum number of pages to crawl (1-100) | 1 |
| cookiesFile | No | Path to a Netscape cookie file for authenticated pages | |
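For orientation, a conforming arguments object might look like the following sketch; the URL and file path are placeholders, not values from the source:

```typescript
// Hypothetical arguments for a read_website call, matching the schema above.
const args = {
  url: 'https://example.com/docs/getting-started', // required
  pages: 3, // optional; 1-100, defaults to 1
  cookiesFile: '/home/user/cookies.txt', // optional; Netscape cookie file
};
```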
## Implementation Reference
- `src/internal/fetchMarkdown.ts:23-127` (handler): Core handler function that fetches website content, optionally crawls linked pages, extracts markdown, and combines the results. (A direct-usage sketch follows this list.)

  ```typescript
  export async function fetchMarkdown(
    url: string,
    options: FetchMarkdownOptions = {}
  ): Promise<FetchMarkdownResult> {
    try {
      const maxPages = options.maxPages ?? 1;
      const visited = new Set<string>();
      const toVisit = [url];
      const allResults: any[] = [];

      // If we want multiple pages, we need to crawl iteratively
      while (toVisit.length > 0 && allResults.length < maxPages) {
        const currentUrl = toVisit.shift()!;

        // Skip if already visited
        if (visited.has(currentUrl)) continue;
        visited.add(currentUrl);

        // Fetch single page
        const crawlOptions: CrawlOptions = {
          depth: 0, // Always single page
          maxConcurrency: options.maxConcurrency ?? 3,
          respectRobots: options.respectRobots ?? true,
          sameOriginOnly: options.sameOriginOnly ?? true,
          userAgent: options.userAgent,
          cacheDir: options.cacheDir ?? '.cache',
          timeout: options.timeout ?? 30000,
        };
        if (options.cookiesFile) {
          (crawlOptions as any).cookiesFile = options.cookiesFile;
        }

        const results = await fetch(currentUrl, crawlOptions);
        if (results && results.length > 0) {
          const result = results[0];
          allResults.push(result);

          // Extract links from markdown if we need more pages
          if (allResults.length < maxPages && result.markdown) {
            const links = extractMarkdownLinks(result.markdown, currentUrl);
            const filteredLinks =
              options.sameOriginOnly !== false
                ? filterSameOriginLinks(links, currentUrl)
                : links;

            // Add new links to visit queue
            for (const link of filteredLinks) {
              if (!visited.has(link) && !toVisit.includes(link)) {
                toVisit.push(link);
              }
            }
          }
        }
      }

      if (allResults.length === 0) {
        return {
          markdown: '',
          error: 'No results returned',
        };
      }

      // Process results as before
      const pagesToReturn = allResults;

      // Combine all pages into a single markdown document
      const combinedMarkdown = pagesToReturn
        .map((result, index) => {
          if (result.error) {
            return `<!-- Error fetching ${result.url}: ${result.error} -->`;
          }

          let pageContent = '';

          // Add page separator for multiple pages
          if (pagesToReturn.length > 1 && index > 0) {
            pageContent += '\n\n---\n\n';
          }

          // Add source URL as a comment
          pageContent += `<!-- Source: ${result.url} -->\n`;

          // Add the content
          pageContent += result.markdown || '';

          return pageContent;
        })
        .join('\n');

      // Return combined results
      return {
        markdown: combinedMarkdown,
        title: pagesToReturn[0].title,
        links: pagesToReturn.flatMap(r => r.links || []),
        error: pagesToReturn.some(r => r.error)
          ? `Some pages had errors: ${pagesToReturn
              .filter(r => r.error)
              .map(r => r.url)
              .join(', ')}`
          : undefined,
      };
    } catch (error) {
      return {
        markdown: '',
        error: error instanceof Error ? error.message : 'Unknown error',
      };
    }
  }
  ```
- `src/serve.ts:117-198` (handler): MCP `CallToolRequestSchema` handler that validates input, lazily loads the `fetchMarkdown` module on first use, and calls it for the `read_website` tool. (A client-side sketch follows this list.)

  ```typescript
  server.setRequestHandler(CallToolRequestSchema, async request => {
    logger.info('Received CallTool request:', request.params.name);
    logger.debug('Request params:', JSON.stringify(request.params, null, 2));

    if (request.params.name !== 'read_website') {
      const error = `Unknown tool: ${request.params.name}`;
      logger.error(error);
      throw new Error(error);
    }

    try {
      // Lazy load the module on first use
      if (!fetchMarkdownModule) {
        logger.debug('Lazy loading fetchMarkdown module...');
        fetchMarkdownModule = await import('./internal/fetchMarkdown.js');
        logger.info('fetchMarkdown module loaded successfully');
      }

      const args = request.params.arguments as any;

      // Validate URL
      if (!args.url || typeof args.url !== 'string') {
        throw new Error('URL parameter is required and must be a string');
      }

      logger.info(`Processing read request for URL: ${args.url}`);
      logger.debug('Read parameters:', {
        url: args.url,
        pages: args.pages,
        cookiesFile: args.cookiesFile,
      });

      logger.debug('Calling fetchMarkdown...');
      // Convert pages to depth (pages - 1 = depth)
      // pages: 1 = depth: 0 (single page)
      // pages: 2+ = depth: 1 (crawl one level to get multiple pages)
      const depth = args.pages > 1 ? 1 : 0;
      const result = await fetchMarkdownModule.fetchMarkdown(args.url, {
        depth: depth,
        respectRobots: false, // Default to not respecting robots.txt
        maxPages: args.pages ?? 1,
        cookiesFile: args.cookiesFile,
      });
      logger.info('Content fetched successfully');

      // If there's an error but we still have some content, return it with a note
      if (result.error && result.markdown) {
        return {
          content: [
            {
              type: 'text',
              text: `${result.markdown}\n\n---\n*Note: ${result.error}*`,
            },
          ],
        };
      }

      // If there's an error and no content, throw it
      if (result.error && !result.markdown) {
        throw new Error(result.error);
      }

      return {
        content: [{ type: 'text', text: result.markdown }],
      };
    } catch (error: any) {
      logger.error('Error fetching content:', error.message);
      logger.debug('Error stack:', error.stack);
      logger.debug('Error details:', {
        name: error.name,
        code: error.code,
        ...error,
      });
      // Re-throw with more context
      throw new Error(
        `Failed to fetch content: ${error instanceof Error ? error.message : 'Unknown error'}`
      );
    }
  });
  ```
- `src/serve.ts:52-85` (schema): Complete tool definition, including name, description, input schema, and annotations for the `read_website` MCP tool.

  ```typescript
  const READ_WEBSITE_TOOL: Tool = {
    name: 'read_website',
    description:
      'Fast, token-efficient web content extraction - ideal for reading documentation, analyzing content, and gathering information from websites. Converts to clean Markdown while preserving links and structure.',
    inputSchema: {
      type: 'object',
      properties: {
        url: {
          type: 'string',
          description: 'HTTP/HTTPS URL to fetch and convert to markdown',
        },
        pages: {
          type: 'number',
          description: 'Maximum number of pages to crawl (default: 1)',
          default: 1,
          minimum: 1,
          maximum: 100,
        },
        cookiesFile: {
          type: 'string',
          description: 'Path to Netscape cookie file for authenticated pages',
          optional: true,
        },
      },
      required: ['url'],
    },
    annotations: {
      title: 'Read Website',
      readOnlyHint: true, // Only reads content
      destructiveHint: false,
      idempotentHint: true, // Same URL returns same content (with cache)
      openWorldHint: true, // Interacts with external websites
    },
  };
  ```
- `src/serve.ts:104-114` (registration): Registers the `read_website` tool by including it in the ListTools response.

  ```typescript
  server.setRequestHandler(ListToolsRequestSchema, async () => {
    logger.debug('Received ListTools request');
    const response = {
      tools: [READ_WEBSITE_TOOL],
    };
    logger.debug(
      'Returning tools:',
      response.tools.map(t => t.name)
    );
    return response;
  });
  ```
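As a rough sketch of how `fetchMarkdown` might be called directly, assuming the exports shown in the handler reference above; the URL and option values are illustrative, and the option names are taken from the `FetchMarkdownOptions` fields the handler reads:

```typescript
import { fetchMarkdown } from './internal/fetchMarkdown.js';

// Crawl up to three same-origin pages starting from a placeholder URL.
const result = await fetchMarkdown('https://example.com/docs', {
  maxPages: 3,
  sameOriginOnly: true,
  timeout: 30000,
});

if (result.error) {
  // Partial failures are reported in the error field rather than thrown.
  console.error('Completed with errors:', result.error);
}
console.log(result.title);
console.log(result.markdown.slice(0, 200)); // preview of the combined markdown
```

Note that the top-level `await` assumes an ES module context.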
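For context, a minimal client-side sketch that exercises both handlers might look like the following, assuming the standard MCP TypeScript SDK (`@modelcontextprotocol/sdk`); the server command and the `dist/serve.js` entry path are hypothetical:

```typescript
import { Client } from '@modelcontextprotocol/sdk/client/index.js';
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js';

// Spawn the server over stdio; the build output path is a guess.
const transport = new StdioClientTransport({
  command: 'node',
  args: ['dist/serve.js'],
});
const client = new Client(
  { name: 'example-client', version: '1.0.0' },
  { capabilities: {} }
);
await client.connect(transport);

// ListTools should report the registered tool.
const { tools } = await client.listTools();
console.log(tools.map(t => t.name)); // ['read_website']

// CallTool reaches the read_website handler shown above.
const response = await client.callTool({
  name: 'read_website',
  arguments: { url: 'https://example.com/docs', pages: 2 },
});
console.log(response.content);
```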