
visit_page

Extract webpage content and optionally capture screenshots using a web research server. Ideal for retrieving and analyzing online information efficiently.

Instructions

Visit a webpage and extract its content

Input Schema

Name            Required  Description                   Default
takeScreenshot  No        Whether to take a screenshot
url             Yes       URL to visit
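The schema above can be expressed as a typed argument object. A minimal sketch (the `VisitPageArgs` interface name is illustrative, not from the source):

```typescript
// Shape of the arguments accepted by visit_page, per the input schema.
interface VisitPageArgs {
  url: string;              // required: URL to visit (http/https only)
  takeScreenshot?: boolean; // optional: whether to capture a screenshot
}

// Example payload for a tools/call request:
const args: VisitPageArgs = {
  url: "https://example.com",
  takeScreenshot: true,
};
```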

Implementation Reference

  • The core handler logic for the 'visit_page' tool within the CallToolRequestSchema switch statement. It validates the URL, navigates to the page using safePageNavigation, extracts content as Markdown using extractContentAsMarkdown, optionally takes a screenshot, stores the result in the research session, and returns a JSON response.
    case "visit_page": {
      // Extract URL and screenshot flag from request
      const { url, takeScreenshot } = request.params.arguments as {
        url: string;              // Target URL to visit
        takeScreenshot?: boolean; // Optional screenshot flag
      };

      // Step 1: Validate URL format and security
      if (!isValidUrl(url)) {
        return {
          content: [{
            type: "text" as const,
            text: `Invalid URL: ${url}. Only http and https protocols are supported.`
          }],
          isError: true
        };
      }

      try {
        // Step 2: Visit page and extract content with retry mechanism
        const result = await withRetry(async () => {
          // Navigate to target URL safely
          await safePageNavigation(page, url);
          const title = await page.title();

          // Step 3: Extract and process page content
          const content = await withRetry(async () => {
            // Convert page content to markdown
            const extractedContent = await extractContentAsMarkdown(page);
            // If no content is extracted, throw an error
            if (!extractedContent) {
              throw new Error('Failed to extract content');
            }
            return extractedContent;
          });

          // Step 4: Create result object with page data
          const pageResult: ResearchResult = {
            url,      // Original URL
            title,    // Page title
            content,  // Markdown content
            timestamp: new Date().toISOString(), // Capture time
          };

          // Step 5: Take screenshot if requested
          let screenshotUri: string | undefined;
          if (takeScreenshot) {
            // Capture and process screenshot
            const screenshot = await takeScreenshotWithSizeLimit(page);
            pageResult.screenshotPath = await saveScreenshot(screenshot, title);

            // Get the index for the resource URI
            const resultIndex = currentSession ? currentSession.results.length : 0;
            screenshotUri = `research://screenshots/${resultIndex}`;

            // Notify clients about new screenshot resource
            server.notification({
              method: "notifications/resources/list_changed"
            });
          }

          // Step 6: Store result in session
          addResult(pageResult);
          return { pageResult, screenshotUri };
        });

        // Step 7: Return formatted result with screenshot URI if taken
        const response: ToolResult = {
          content: [{
            type: "text" as const,
            text: JSON.stringify({
              url: result.pageResult.url,
              title: result.pageResult.title,
              content: result.pageResult.content,
              timestamp: result.pageResult.timestamp,
              screenshot: result.screenshotUri
                ? `View screenshot via *MCP Resources* (Paperclip icon) @ URI: ${result.screenshotUri}`
                : undefined
            }, null, 2)
          }]
        };
        return response;
      } catch (error) {
        // Handle and format page visit errors
        return {
          content: [{
            type: "text" as const,
            text: `Failed to visit page: ${(error as Error).message}`
          }],
          isError: true
        };
      }
    }
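The handler calls `isValidUrl`, whose implementation is not shown on this page. A minimal sketch consistent with the handler's error message (only http and https protocols supported) could look like this; the body is an assumption, not the actual source:

```typescript
// Hypothetical sketch of isValidUrl: accept only parseable http/https URLs.
function isValidUrl(url: string): boolean {
  try {
    const parsed = new URL(url);
    return parsed.protocol === "http:" || parsed.protocol === "https:";
  } catch {
    return false; // not parseable as a URL at all
  }
}
```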
  • The tool definition in the TOOLS array, including name, description, and input schema for validation. This is returned by the ListTools handler.
    {
      name: "visit_page",
      description: "Visit a webpage and extract its content",
      inputSchema: {
        type: "object",
        properties: {
          url: {
            type: "string",
            description: "URL to visit"
          },
          takeScreenshot: {
            type: "boolean",
            description: "Whether to take a screenshot"
          },
        },
        required: ["url"],
      },
    },
  • index.ts:580-582 (registration)
    Registration of the ListToolsRequestHandler, which exposes the TOOLS list (including visit_page) to MCP clients.
    server.setRequestHandler(ListToolsRequestSchema, async () => ({
      tools: TOOLS // Return list of available research tools
    }));
  • Helper function used by visit_page handler for secure page navigation, handling Google consent, bot protection detection, and content validation.
    async function safePageNavigation(page: Page, url: string): Promise<void> {
      try {
        // Step 1: Set cookies to bypass consent banner
        await page.context().addCookies([{
          name: 'CONSENT',
          value: 'YES+',
          domain: '.google.com',
          path: '/'
        }]);

        // Step 2: Initial navigation
        const response = await page.goto(url, {
          waitUntil: 'domcontentloaded',
          timeout: 15000
        });

        // Step 3: Basic response validation
        if (!response) {
          throw new Error('Navigation failed: no response received');
        }

        // Check HTTP status code; if 400 or higher, throw an error
        const status = response.status();
        if (status >= 400) {
          throw new Error(`HTTP ${status}: ${response.statusText()}`);
        }

        // Step 4: Wait for network to become idle or timeout
        await Promise.race([
          page.waitForLoadState('networkidle', { timeout: 5000 })
            .catch(() => { /* ignore timeout */ }),
          // Fallback timeout in case networkidle never occurs
          new Promise(resolve => setTimeout(resolve, 5000))
        ]);

        // Step 5: Security and content validation
        const validation = await page.evaluate(() => {
          const botProtectionExists = [
            '#challenge-running',    // Cloudflare
            '#cf-challenge-running', // Cloudflare
            '#px-captcha',           // PerimeterX
            '#ddos-protection',      // Various
            '#waf-challenge-html'    // Various WAFs
          ].some(selector => document.querySelector(selector));

          // Check for suspicious page titles
          const suspiciousTitle = [
            'security check',
            'ddos protection',
            'please wait',
            'just a moment',
            'attention required'
          ].some(phrase => document.title.toLowerCase().includes(phrase));

          // Count words in the page content
          const bodyText = document.body.innerText || '';
          const words = bodyText.trim().split(/\s+/).length;

          // Return validation results
          return {
            wordCount: words,
            botProtection: botProtectionExists,
            suspiciousTitle,
            title: document.title
          };
        });

        // If bot protection is detected, throw an error
        if (validation.botProtection) {
          throw new Error('Bot protection detected');
        }
        // If the page title is suspicious, throw an error
        if (validation.suspiciousTitle) {
          throw new Error(`Suspicious page title detected: "${validation.title}"`);
        }
        // If the page contains insufficient content, throw an error
        if (validation.wordCount < 10) {
          throw new Error('Page contains insufficient content');
        }
      } catch (error) {
        // Wrap any navigation error with the URL for context
        throw new Error(`Navigation to ${url} failed: ${(error as Error).message}`);
      }
    }
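The validation heuristics that `safePageNavigation` runs inside the browser context can be factored into a standalone pure function for clarity. A sketch (the `validatePageContent` name and its parameters are illustrative; the phrase list and word-count logic come from the source above):

```typescript
// Standalone version of the in-page validation heuristics.
interface PageValidation {
  botProtection: boolean;   // a known challenge element was found
  suspiciousTitle: boolean; // title matches a bot-protection phrase
  wordCount: number;        // rough word count of the body text
}

function validatePageContent(
  title: string,
  bodyText: string,
  hasChallengeElement: boolean
): PageValidation {
  const suspiciousPhrases = [
    "security check", "ddos protection", "please wait",
    "just a moment", "attention required"
  ];
  return {
    botProtection: hasChallengeElement,
    suspiciousTitle: suspiciousPhrases.some(p => title.toLowerCase().includes(p)),
    wordCount: bodyText.trim().split(/\s+/).length,
  };
}
```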
  • Key helper function that extracts main content from the page and converts it to clean Markdown format, called within the visit_page handler.
    async function extractContentAsMarkdown(
      page: Page,        // Playwright page to extract from
      selector?: string  // Optional CSS selector to target specific content
    ): Promise<string> {
      // Step 1: Execute content extraction in browser context
      const html = await page.evaluate((sel) => {
        // Handle case where a specific selector is provided
        if (sel) {
          const element = document.querySelector(sel);
          // Return element content or empty string if not found
          return element ? element.outerHTML : '';
        }

        // Step 2: Try standard content containers first
        const contentSelectors = [
          'main',          // HTML5 semantic main content
          'article',       // HTML5 semantic article content
          '[role="main"]', // ARIA main content role
          '#content',      // Common content ID
          '.content',      // Common content class
          '.main',         // Alternative main class
          '.post',         // Blog post content
          '.article',      // Article content container
        ];

        // Try each selector in priority order
        for (const contentSelector of contentSelectors) {
          const element = document.querySelector(contentSelector);
          if (element) {
            return element.outerHTML; // Return first matching content
          }
        }

        // Step 3: Fall back to cleaning the full body content
        const body = document.body;

        // Define elements to remove for cleaner content
        const elementsToRemove = [
          // Navigation elements
          'header', 'footer', 'nav', '[role="navigation"]',
          // Sidebars and complementary content
          'aside', '.sidebar', '[role="complementary"]',
          // Navigation-related classes
          '.nav', '.menu',
          // Page structure classes
          '.header', '.footer',
          // Advertising and notices
          '.advertisement', '.ads', '.cookie-notice',
        ];

        // Remove each unwanted element from content
        elementsToRemove.forEach(sel => {
          body.querySelectorAll(sel).forEach(el => el.remove());
        });

        // Return cleaned body content
        return body.outerHTML;
      }, selector);

      // Step 4: Handle empty content case
      if (!html) {
        return '';
      }

      try {
        // Step 5: Convert HTML to Markdown
        const markdown = turndownService.turndown(html);

        // Step 6: Clean up and format markdown
        return markdown
          .replace(/\n{3,}/g, '\n\n') // Replace excessive newlines with double
          .replace(/^- $/gm, '')      // Remove empty list items
          .replace(/^\s+$/gm, '')     // Remove whitespace-only lines
          .trim();                    // Remove leading/trailing whitespace
      } catch (error) {
        // Log conversion errors and return original HTML as fallback
        console.error('Error converting HTML to Markdown:', error);
        return html;
      }
    }
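The post-conversion cleanup in Step 6 can be exercised on its own. A sketch extracting it into a helper (the `cleanMarkdown` name is illustrative; the regex chain is taken verbatim from the source):

```typescript
// The Markdown cleanup pass applied after Turndown conversion.
function cleanMarkdown(markdown: string): string {
  return markdown
    .replace(/\n{3,}/g, "\n\n") // collapse runs of 3+ newlines to one blank line
    .replace(/^- $/gm, "")      // drop empty list items
    .replace(/^\s+$/gm, "")     // blank out whitespace-only lines
    .trim();                    // strip leading/trailing whitespace
}
```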

MCP directory API

We provide all the information about MCP servers via our MCP directory API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/chuanmingliu/mcp-webresearch'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.