get_webpage_content
Extract webpage content and convert it to Markdown, HTML, or plain text format for analysis and processing.
Instructions
Fetch webpage content and convert to specified format. Supports Markdown, HTML, and plain text.
Input Schema
JSON Schema (rendered below as a table)
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | The URL of the webpage to scrape. Must be a valid HTTP/HTTPS link. | |
| format | No | Output format: markdown (default), html, text | markdown |
Implementation Reference
- src/mcp/server.js:216-251 (handler)Main handler function executing the tool logic: validates input, fetches content via service, formats response.async function handleGetWebpageContent(args) { const { url, format = 'markdown' } = args; if (!url || typeof url !== 'string') { throw new Error('URL parameter is required and must be a string'); } try { new URL(url); } catch (error) { throw new Error('Invalid URL format'); } if (!['markdown', 'html', 'text'].includes(format)) { throw new Error('format must be one of: markdown, html, text'); } const searchService = (await import('../services/searchService.js')).default; let result; if (format === 'markdown') { result = await searchService.getWebpageMarkdown(url); } else { result = await searchService.scrapeWebpage(url); } return { tool: 'get_webpage_content', url, format, title: result.title, description: result.description, content: format === 'markdown' ? result.markdown : result.content, timestamp: result.timestamp }; }
- src/mcp/server.js:40-59 (schema)Input schema and metadata definition for the get_webpage_content tool, used for registration and validation.{ name: 'get_webpage_content', description: 'Fetch webpage content and convert to specified format. Supports Markdown, HTML, and plain text.', inputSchema: { type: 'object', properties: { url: { type: 'string', description: 'The URL of the webpage to scrape. Must be a valid HTTP/HTTPS link.' }, format: { type: 'string', enum: ['markdown', 'html', 'text'], description: 'Output format: markdown (default), html, text', default: 'markdown' } }, required: ['url'] } },
- src/mcp/server.js:134-136 (registration)Tool dispatch registration in the CallToolRequestHandler switch statement.case 'get_webpage_content': result = await handleGetWebpageContent(args); break;
- Core helper for fetching and converting webpage to Markdown format (primary format). Uses axios, cheerio, turndown.async getWebpageMarkdown(url) { try { const response = await axios.get(url, { headers: { 'User-Agent': this.getRandomUserAgent(), 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'keep-alive' }, timeout: 15000 }); const $ = cheerio.load(response.data); // Extract page info const title = $('title').text().trim(); const description = $('meta[name="description"]').attr('content') || ''; // Clean HTML, remove unwanted elements $('script, style, noscript, iframe, img').remove(); $('nav, header, footer, aside').remove(); // Get main content area let mainContent = $('main, article, .content, .main, #content, #main'); if (mainContent.length === 0) { mainContent = $('body'); } // Convert to Markdown const TurndownService = (await import('turndown')).default; const turndownService = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced', emDelimiter: '*', bulletListMarker: '-' }); // Custom rule for links turndownService.addRule('links', { filter: 'a', replacement: function(content, node) { const href = node.getAttribute('href'); const text = content.trim(); if (href && text) { return `[${text}](${href})`; } return content; } }); const markdown = turndownService.turndown(mainContent.html()); logger.info(`Webpage converted to Markdown successfully: ${url}`); return { url, title, description, markdown, // htmlSource: response.data, timestamp: new Date().toISOString() }; } catch (error) { logger.error(`Markdown conversion error for ${url}:`, error); throw new Error(`Failed to convert webpage to Markdown: ${error.message}`); } }
- Core helper for basic webpage scraping (used for html/text formats). Extracts title, desc, content, links using axios/cheerio.async scrapeWebpage(url) { try { const response = await axios.get(url, { headers: { 'User-Agent': this.getRandomUserAgent(), 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'keep-alive' }, timeout: 15000 }); const $ = cheerio.load(response.data); // Extract page info const title = $('title').text().trim(); const description = $('meta[name="description"]').attr('content') || ''; const keywords = $('meta[name="keywords"]').attr('content') || ''; // Extract main content const content = $('body').text() .replace(/\s+/g, ' ') .trim() .substring(0, 2000); // limit content length // Extract links const links = []; $('a[href]').each((index, element) => { if (index < 50) { // limit number of links const href = $(element).attr('href'); const text = $(element).text().trim(); if (href && text && href.startsWith('http')) { links.push({ url: href, text }); } } }); logger.info(`Webpage scraped successfully: ${url}`); return { url, title, description, keywords, content, links, timestamp: new Date().toISOString() }; } catch (error) { logger.error(`Webpage scraping error for ${url}:`, error); throw new Error(`Failed to scrape webpage: ${error.message}`); } }