Fetch Browser

by TheSethRose
Verified
  • src
/**
 * URL Fetcher Tool
 *
 * This module implements an MCP tool for fetching and processing URLs.
 * It includes features like:
 * - Proper URL validation and sanitization
 * - Response type handling (HTML, JSON, text, markdown)
 * - Special handling for Google search results
 * - Error handling and retries
 * - Rate limiting protection
 * - Security headers and user agent
 *
 * All diagnostics are written to stderr: when the server runs over the MCP
 * stdio transport, stdout carries the protocol stream and must stay clean.
 */
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { z } from "zod";
import { Window } from "happy-dom";
// NOTE(review): JSDOM is not referenced anywhere in this module; kept because
// removing a file-level import is outside the scope of this change.
import { JSDOM } from "jsdom";
import { writeFileSync } from "fs";

// Constants
const MAX_RETRIES = 3;
const INITIAL_RETRY_DELAY = 1000; // 1 second
const MAX_RESPONSE_SIZE = 10 * 1024 * 1024; // 10MB
const DEFAULT_TIMEOUT = 30000; // 30 seconds
const SIMPLE_FETCH_TIMEOUT = 5000; // fixed timeout used by fetchUrl()
const GOOGLE_SEARCH_URL = 'https://www.google.com/search';
// NOTE(review): declared but not applied anywhere in this module; result lists
// are currently returned uncapped. Confirm intent before enforcing it.
const MAX_SEARCH_RESULTS = 5;

const BROWSER_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
  'Accept-Language': 'en-US,en;q=0.9',
  'Accept-Encoding': 'gzip, deflate, br',
  'Cache-Control': 'no-cache',
  'Pragma': 'no-cache',
  'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
  'Sec-Ch-Ua-Mobile': '?0',
  'Sec-Ch-Ua-Platform': '"macOS"',
  'Sec-Fetch-Dest': 'document',
  'Sec-Fetch-Mode': 'navigate',
  'Sec-Fetch-Site': 'none',
  'Sec-Fetch-User': '?1',
  'Upgrade-Insecure-Requests': '1'
};

/** Output formats understood by the fetcher. */
type FetchResponseType = 'text' | 'json' | 'html' | 'markdown';

/** MIME type reported to the MCP client for each output format. */
const MIME_TYPES: Record<FetchResponseType, string> = {
  json: 'application/json',
  markdown: 'text/markdown',
  html: 'text/html',
  text: 'text/plain'
};

// Schema for URL fetcher parameters
const UrlFetcherSchema = z.object({
  url: z.string()
    .url()
    .transform(url => new URL(url))
    .describe("The URL to fetch"),
  responseType: z.enum(['text', 'json', 'html', 'markdown'])
    .default('text')
    .describe("Expected response type"),
  timeout: z.number()
    .min(1000)
    .max(60000)
    .default(DEFAULT_TIMEOUT)
    .describe("Request timeout in milliseconds")
});

// Schema used by fetchUrlWithParams(); hoisted so it is built once, not per call.
const FetchUrlParamsSchema = z.object({
  url: z.instanceof(URL),
  responseType: z.enum(['json', 'markdown']).default('json'),
  timeout: z.number().min(1000).max(30000).default(5000)
});

interface FetchUrlParams {
  url: URL;
  responseType?: 'json' | 'markdown';
  timeout?: number;
}

/** A single extracted search hit. */
interface SearchResult {
  title: string;
  url: string;
  description?: string;
}

/**
 * Raised by processResponse() when the body was received but cannot be
 * delivered in the requested form (too large, wrong content type, invalid
 * JSON). Re-fetching cannot fix these, so the retry loop does not retry them.
 */
class ResponseContentError extends Error {
  constructor(message: string) {
    super(message);
    this.name = 'ResponseContentError';
  }
}

/** Write a diagnostic line to stderr (never stdout, see module header). */
function debugLog(message: string): void {
  console.error(message);
}

/** Resolve after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>(resolve => setTimeout(resolve, ms));
}

/** True when `error` is the rejection produced by aborting a fetch. */
function isAbortError(error: unknown): boolean {
  return typeof error === 'object'
    && error !== null
    && (error as { name?: unknown }).name === 'AbortError';
}

/** Human-readable message for an arbitrary thrown value. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : 'Unknown error';
}

/** Escape text for safe interpolation into HTML content or attribute values. */
function escapeHtml(value: string): string {
  return value
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

/**
 * True when `url` addresses the Google search endpoint, compared on
 * origin + pathname so the query string is ignored and look-alike paths
 * (e.g. "/searchfoo") do not match. Unparseable strings yield false.
 */
function isGoogleSearchUrl(url: URL | string): boolean {
  let parsed: URL;
  if (typeof url === 'string') {
    try {
      parsed = new URL(url);
    } catch {
      return false;
    }
  } else {
    parsed = url;
  }
  return parsed.origin + parsed.pathname === GOOGLE_SEARCH_URL;
}

/**
 * Issue a GET with the browser-like headers and abort it after `timeoutMs`.
 * The timer is cleared on every path so a failed request does not leave a
 * pending timeout behind. The timeout covers the wait for response headers;
 * reading the body happens after this function returns.
 */
async function fetchWithTimeout(url: string, timeoutMs: number): Promise<Response> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    return await fetch(url, {
      signal: controller.signal,
      headers: BROWSER_HEADERS,
      redirect: 'follow'
    });
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Helper function for exponential backoff
 *
 * @param attempt - Zero-based attempt index.
 * @returns Delay in milliseconds: INITIAL_RETRY_DELAY doubled per attempt.
 */
function getRetryDelay(attempt: number): number {
  return INITIAL_RETRY_DELAY * Math.pow(2, attempt);
}

/**
 * Extract Google search results from HTML.
 *
 * Selectors are tried in order and the first group that yields results wins:
 * news clusters ([data-news-cluster-id]), general results (".g"), then the
 * alternative container "div.tF2Cxc".
 *
 * @param html - The HTML string to parse.
 * @param responseType - The desired output format.
 * @returns A formatted string for 'markdown' / 'html' / 'text', or the raw
 *          result objects for 'json'.
 */
function extractGoogleResults(html: string, responseType: 'text' | 'json' | 'html' | 'markdown'): string | object[] {
  const dom = new Window({
    settings: {
      disableJavaScriptFileLoading: true,
      disableJavaScriptEvaluation: true,
      disableCSSFileLoading: true,
      timer: {
        maxTimeout: 3000,
        maxIntervalTime: 3000,
      },
    },
  });

  const results: SearchResult[] = [];

  try {
    const document = dom.document;
    document.write(html);
    debugLog(`Document body length: ${document.body.innerHTML.length}`);

    // Collect results from every element matching `containerSelector`.
    // When `snippetSelector` is omitted the snippet is taken from the element
    // following the title, which is how the news layout is handled.
    const collect = (
      containerSelector: string,
      label: string,
      titleSelector: string,
      snippetSelector?: string,
      countLabel?: string
    ): void => {
      const elements = document.querySelectorAll(containerSelector);
      if (countLabel) {
        debugLog(`Found ${elements.length} ${countLabel}`);
      }
      elements.forEach((element, index) => {
        debugLog(`Processing ${label} element ${index + 1}`);
        const titleEl = element.querySelector(titleSelector);
        const linkEl = element.querySelector('a');
        if (!titleEl || !linkEl) {
          return;
        }
        const snippetEl = snippetSelector
          ? element.querySelector(snippetSelector)
          : titleEl.nextElementSibling;
        const title = titleEl.textContent?.trim();
        const url = linkEl.getAttribute('href');
        const description = snippetEl?.textContent?.trim();
        if (title && url) {
          results.push({ title, url, description });
        } else {
          debugLog(`Missing title or URL for ${label} element ${index + 1}`);
        }
      });
    };

    // Try news results first
    collect('[data-news-cluster-id]', 'news', '[role="heading"]', undefined, 'news elements');

    // If no news results, try general search results
    if (results.length === 0) {
      collect('.g', 'general', 'h3', '.VwiC3b', 'general result elements');
    }

    // If still no results, try alternative selectors
    if (results.length === 0) {
      debugLog('No results found with primary selectors, trying alternatives...');
      collect('div.tF2Cxc', 'alternative', 'h3', '.VwiC3b');
    }

    debugLog(`Total results found: ${results.length}`);
  } finally {
    // Release the window even if parsing threw.
    void dom.happyDOM?.close();
  }

  switch (responseType) {
    case 'markdown':
      return results
        .map(r => `- [${r.title}](${r.url})${r.description ? `\n ${r.description}` : ''}`)
        .join('\n');
    case 'html':
      // Titles, URLs and snippets come from a scraped page: escape them.
      return results
        .map(r => `<div class="result"><h3><a href="${escapeHtml(r.url)}">${escapeHtml(r.title)}</a></h3>${r.description ? `<p>${escapeHtml(r.description)}</p>` : ''}</div>`)
        .join('\n');
    case 'text':
      return results
        .map(r => `${r.title}\n${r.url}${r.description ? `\n${r.description}` : ''}`)
        .join('\n\n');
    case 'json':
    default:
      return results;
  }
}

/**
 * Convert HTML to Markdown
 *
 * A regex-based, best-effort conversion (not a full HTML parser). Handles
 * headings (level preserved), bold, italic, links, lists, paragraphs and line
 * breaks; comments and script/style/noscript blocks are dropped; any other
 * tag is stripped, leaving its text.
 *
 * @param html - HTML source.
 * @returns Markdown text, trimmed, with runs of blank lines collapsed.
 */
function htmlToMarkdown(html: string): string {
  return html
    // Drop content that must never appear in the output
    .replace(/<!--[\s\S]*?-->/g, '')
    .replace(/<(script|style|noscript)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, '')
    // Headers: emit one '#' per heading level
    .replace(/<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1\s*>/gi,
      (_match, level: string, content: string) => `\n${'#'.repeat(Number(level))} ${content.trim()}\n`)
    // Bold
    .replace(/<(strong|b)\b[^>]*>([\s\S]*?)<\/\1\s*>/gi, '**$2**')
    // Italic
    .replace(/<(em|i)\b[^>]*>([\s\S]*?)<\/\1\s*>/gi, '*$2*')
    // Links (double- or single-quoted href)
    .replace(/<a\b[^>]*?\bhref\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>([\s\S]*?)<\/a\s*>/gi,
      (_match, doubleQuoted: string | undefined, singleQuoted: string | undefined, label: string) =>
        `[${label}](${doubleQuoted ?? singleQuoted ?? ''})`)
    // Lists
    .replace(/<(ul|ol)\b[^>]*>([\s\S]*?)<\/\1\s*>/gi, (_match, tag: string, items: string) => {
      // The regex is case-insensitive, so normalise before comparing.
      const ordered = tag.toLowerCase() === 'ol';
      let position = 0;
      const rendered = items.replace(/<li\b[^>]*>([\s\S]*?)<\/li\s*>/gi,
        (_li, item: string) => `${ordered ? `${++position}.` : '-'} ${item.trim()}\n`);
      return `\n${rendered}`;
    })
    // Paragraphs
    .replace(/<p\b[^>]*>([\s\S]*?)<\/p\s*>/gi, '\n$1\n')
    // Line breaks
    .replace(/<br\b[^>]*>/gi, '\n')
    // Remove remaining HTML tags
    .replace(/<[^>]*>/g, '')
    // Fix spacing
    .replace(/\n\s*\n/g, '\n\n')
    .trim();
}

/**
 * Process response based on type
 *
 * @param response - A successful fetch Response whose body is still unread.
 * @param responseType - The output format requested by the caller.
 * @param url - The requested URL; used to detect Google search pages.
 * @returns The body rendered as a string in the requested format.
 * @throws ResponseContentError when the body exceeds MAX_RESPONSE_SIZE, when
 *         the content type does not match 'json' / 'html', or when a JSON
 *         body fails to parse.
 */
async function processResponse(response: Response, responseType: 'text' | 'json' | 'html' | 'markdown', url: URL): Promise<string> {
  const contentType = (response.headers.get('content-type') || '').toLowerCase();

  // Check the advertised response size before reading the body
  const contentLength = parseInt(response.headers.get('content-length') || '0', 10);
  if (contentLength > MAX_RESPONSE_SIZE) {
    throw new ResponseContentError('Response too large');
  }

  const text = await response.text();

  // Content-Length may be absent (chunked) or describe the compressed size,
  // so enforce the limit on what was actually received as well.
  if (Buffer.byteLength(text, 'utf8') > MAX_RESPONSE_SIZE) {
    throw new ResponseContentError('Response too large');
  }

  // Special handling for Google search results
  if (isGoogleSearchUrl(url)) {
    // NOTE(review): 'text' and 'html' are mapped to 'json' here even though
    // extractGoogleResults() can render them. Kept as-is because callers may
    // rely on it; confirm whether this is intentional.
    const mappedType = responseType === 'json' || responseType === 'markdown' ? responseType : 'json';
    const results = extractGoogleResults(text, mappedType);
    return typeof results === 'string' ? results : JSON.stringify(results, null, 2);
  }

  switch (responseType) {
    case 'json': {
      // Accept application/json and structured-syntax suffixes such as
      // application/ld+json.
      const looksLikeJson = contentType.includes('application/json') || /\+json\b/.test(contentType);
      if (!looksLikeJson) {
        throw new ResponseContentError('Response is not JSON');
      }
      let parsed: unknown;
      try {
        parsed = JSON.parse(text);
      } catch (error) {
        throw new ResponseContentError(`Response is not valid JSON: ${describeError(error)}`);
      }
      // Pretty print JSON
      return JSON.stringify(parsed, null, 2);
    }
    case 'html':
      if (!contentType.includes('text/html') && !contentType.includes('application/xhtml+xml')) {
        throw new ResponseContentError('Response is not HTML');
      }
      return text;
    case 'markdown':
      if (contentType.includes('text/html')) {
        return htmlToMarkdown(text);
      } else if (contentType.includes('text/markdown')) {
        return text;
      }
      // If not HTML or Markdown, convert plain text to markdown
      return `\`\`\`\n${text}\n\`\`\``;
    case 'text':
    default:
      return text;
  }
}

/** Build the MCP error payload used by the fetch_url tool. */
function errorResult(message: string) {
  return {
    content: [{ type: "text" as const, text: message }],
    isError: true
  };
}

/**
 * Register the URL fetcher tool with the MCP server
 *
 * The tool makes up to MAX_RETRIES attempts with exponential backoff.
 * Retried: HTTP 429 and errors thrown while fetching or reading the body.
 * Not retried: timeouts, other non-2xx statuses, and content errors
 * (ResponseContentError), since a new request cannot change the outcome.
 *
 * @param server - The MCP server to register "fetch_url" on.
 */
export function registerUrlFetcherTool(server: McpServer) {
  server.tool(
    "fetch_url",
    "Fetch content from a URL with proper error handling and response processing",
    UrlFetcherSchema.shape,
    async (params) => {
      const targetUrl = params.url.toString();

      for (let attempt = 0; attempt < MAX_RETRIES; attempt++) {
        const isLastAttempt = attempt === MAX_RETRIES - 1;
        try {
          const response = await fetchWithTimeout(targetUrl, params.timeout);

          // Handle different status codes
          if (!response.ok) {
            if (response.status === 429) {
              if (isLastAttempt) {
                return errorResult("Rate limit exceeded. Please try again later.");
              }
              await sleep(getRetryDelay(attempt));
              continue;
            }
            return errorResult(`HTTP ${response.status}: ${response.statusText}`);
          }

          // Process the response
          const processedContent = await processResponse(response, params.responseType, params.url);

          // Always return as text type with appropriate metadata
          return {
            content: [{
              type: "text",
              text: processedContent,
              mimeType: MIME_TYPES[params.responseType]
            }],
            metadata: {
              url: targetUrl,
              contentType: response.headers.get('content-type'),
              contentLength: response.headers.get('content-length'),
              isGoogleSearch: isGoogleSearchUrl(params.url),
              responseType: params.responseType
            }
          };
        } catch (error) {
          if (isAbortError(error)) {
            return errorResult(`Request timed out after ${params.timeout}ms`);
          }
          // Content errors are deterministic; retrying would only repeat them.
          if (error instanceof ResponseContentError || isLastAttempt) {
            return errorResult(`Failed to fetch URL: ${describeError(error)}`);
          }
          await sleep(getRetryDelay(attempt));
        }
      }

      return errorResult("Failed to fetch URL after all retry attempts");
    }
  );
}

/**
 * Fetch a URL once (no retries) with a fixed 5 second timeout.
 *
 * Google search pages are returned as extracted results in the requested
 * format. For any other page: 'markdown' converts the body with
 * htmlToMarkdown(), 'html' and 'text' return the body unchanged, and 'json'
 * wraps it as `[{ content }]`.
 *
 * Side effect: the fetched body is also written to "fetchedPage.html" in the
 * current working directory (best effort; a write failure is only logged).
 *
 * @param url - Absolute URL to fetch.
 * @param responseType - Desired output format; defaults to 'json'.
 * @throws Error prefixed with "Failed to fetch URL: " on network failure,
 *         timeout, or a non-2xx status.
 */
export async function fetchUrl(url: string, responseType: 'text' | 'json' | 'html' | 'markdown' = 'json'): Promise<string | object[]> {
  try {
    const response = await fetchWithTimeout(url, SIMPLE_FETCH_TIMEOUT);

    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }

    const html = await response.text();

    // Debug dump of the fetched page. It must not turn a successful fetch
    // into a failure (e.g. on a read-only working directory).
    try {
      writeFileSync('fetchedPage.html', html);
      debugLog(`Saved fetched HTML (${html.length} bytes) to fetchedPage.html`);
    } catch (writeError) {
      debugLog(`Could not save fetchedPage.html: ${describeError(writeError)}`);
    }

    if (isGoogleSearchUrl(url)) {
      return extractGoogleResults(html, responseType);
    }

    switch (responseType) {
      case 'markdown':
        return htmlToMarkdown(html);
      case 'html':
        return html;
      case 'text':
        return html;
      case 'json':
      default:
        return [{ content: html }];
    }
  } catch (error) {
    if (error instanceof Error) {
      throw new Error(`Failed to fetch URL: ${error.message}`);
    }
    throw error;
  }
}

/**
 * Fetch a URL once (no retries) using validated parameters.
 *
 * `params` is validated first; a validation failure rejects with the schema
 * library's error, unwrapped. Google search pages are returned as extracted
 * results. For any other page, 'markdown' converts the body with
 * htmlToMarkdown() and 'json' wraps it as `[{ content }]`.
 *
 * @param params - Target URL plus optional responseType (default 'json') and
 *                 timeout in milliseconds (1000-30000, default 5000).
 * @throws Error prefixed with "Failed to fetch URL: " on network failure,
 *         timeout, or a non-2xx status.
 */
export async function fetchUrlWithParams(params: FetchUrlParams): Promise<string | object[]> {
  const validated = await FetchUrlParamsSchema.parseAsync(params);

  try {
    const response = await fetchWithTimeout(validated.url.toString(), validated.timeout);

    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }

    const html = await response.text();

    if (isGoogleSearchUrl(validated.url)) {
      return extractGoogleResults(html, validated.responseType);
    }

    return validated.responseType === 'markdown'
      ? htmlToMarkdown(html)
      : [{ content: html }];
  } catch (error) {
    if (error instanceof Error) {
      throw new Error(`Failed to fetch URL: ${error.message}`);
    }
    throw error;
  }
}