url-fetcher.ts•14.7 kB
/**
 * URL Fetcher Tool
 *
 * This module implements an MCP tool for fetching and processing URLs.
 * It includes features like:
 * - Proper URL validation and sanitization
 * - Response type handling (HTML, JSON, text, markdown)
 * - Special handling for Google search results
 * - Error handling and retries
 * - Rate limiting protection
 * - Security headers and user agent
 */
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { z } from "zod";
import { Window } from "happy-dom";
import { JSDOM } from "jsdom";
import { writeFileSync } from "fs";
// Constants
const MAX_RETRIES = 3;
const INITIAL_RETRY_DELAY = 1000; // 1 second
const MAX_RESPONSE_SIZE = 10 * 1024 * 1024; // 10MB
const DEFAULT_TIMEOUT = 30000; // 30 seconds
const GOOGLE_SEARCH_URL = 'https://www.google.com/search';
const MAX_SEARCH_RESULTS = 5;
const BROWSER_HEADERS = {
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
  'Accept-Language': 'en-US,en;q=0.9',
  'Accept-Encoding': 'gzip, deflate, br',
  'Cache-Control': 'no-cache',
  'Pragma': 'no-cache',
  'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
  'Sec-Ch-Ua-Mobile': '?0',
  'Sec-Ch-Ua-Platform': '"macOS"',
  'Sec-Fetch-Dest': 'document',
  'Sec-Fetch-Mode': 'navigate',
  'Sec-Fetch-Site': 'none',
  'Sec-Fetch-User': '?1',
  'Upgrade-Insecure-Requests': '1'
};
// Schema for URL fetcher parameters
const UrlFetcherSchema = z.object({
  url: z.string()
    .url()
    .transform(url => new URL(url))
    .describe("The URL to fetch"),
  responseType: z.enum(['text', 'json', 'html', 'markdown'])
    .default('text')
    .describe("Expected response type"),
  timeout: z.number()
    .min(1000)
    .max(60000)
    .default(DEFAULT_TIMEOUT)
    .describe("Request timeout in milliseconds")
});
interface FetchUrlParams {
  url: URL;
  responseType?: 'json' | 'markdown';
  timeout?: number;
}
/**
 * Helper function for exponential backoff
 */
function getRetryDelay(attempt: number): number {
  return INITIAL_RETRY_DELAY * Math.pow(2, attempt);
}
/**
 * Extract Google search results from HTML.
 *
 * Depending on the content, it looks for a news structure (using the
 * [data-news-cluster-id] attribute) or falls back to general result elements (using the
 * ".g" selector). This mimics the extraction logic seen in the local-web-search repo.
 *
 * @param html - The HTML string to parse.
 * @param responseType - The desired output format.
 * @returns The results in the chosen format.
 */
function extractGoogleResults(html: string, responseType: 'text' | 'json' | 'html' | 'markdown'): string | object[] {
  const dom = new Window({
    settings: {
      disableJavaScriptFileLoading: true,
      disableJavaScriptEvaluation: true,
      disableCSSFileLoading: true,
      timer: {
        maxTimeout: 3000,
        maxIntervalTime: 3000,
      },
    },
  });
  const document = dom.document;
  document.write(html);
  console.log(`Document body length: ${document.body.innerHTML.length}`);
  const results: { title: string; url: string; description?: string }[] = [];
  // Try news results first
  const newsElements = document.querySelectorAll('[data-news-cluster-id]');
  console.log(`Found ${newsElements.length} news elements`);
  newsElements.forEach((element, index) => {
    console.log(`Processing news element ${index + 1}`);
    const titleEl = element.querySelector('[role="heading"]');
    const linkEl = element.querySelector('a');
    const snippetEl = titleEl?.nextElementSibling;
    if (titleEl && linkEl) {
      const title = titleEl.textContent?.trim();
      const url = linkEl.getAttribute('href');
      const description = snippetEl?.textContent?.trim();
      if (title && url) {
        results.push({ title, url, description });
      } else {
        console.log(`Missing title or URL for news element ${index + 1}`);
      }
    }
  });
  // If no news results, try general search results
  if (results.length === 0) {
    const generalElements = document.querySelectorAll('.g');
    console.log(`Found ${generalElements.length} general result elements`);
    generalElements.forEach((element, index) => {
      console.log(`Processing general element ${index + 1}`);
      const titleEl = element.querySelector('h3');
      const linkEl = element.querySelector('a');
      const snippetEl = element.querySelector('.VwiC3b');
      if (titleEl && linkEl) {
        const title = titleEl.textContent?.trim();
        const url = linkEl.getAttribute('href');
        const description = snippetEl?.textContent?.trim();
        if (title && url) {
          results.push({ title, url, description });
        } else {
          console.log(`Missing title or URL for general element ${index + 1}`);
        }
      }
    });
  }
  // If still no results, try alternative selectors
  if (results.length === 0) {
    console.log('No results found with primary selectors, trying alternatives...');
    const alternativeElements = document.querySelectorAll('div.tF2Cxc');
    alternativeElements.forEach((element, index) => {
      console.log(`Processing alternative element ${index + 1}`);
      const titleEl = element.querySelector('h3');
      const linkEl = element.querySelector('a');
      const snippetEl = element.querySelector('.VwiC3b');
      if (titleEl && linkEl) {
        const title = titleEl.textContent?.trim();
        const url = linkEl.getAttribute('href');
        const description = snippetEl?.textContent?.trim();
        if (title && url) {
          results.push({ title, url, description });
        } else {
          console.log(`Missing title or URL for alternative element ${index + 1}`);
        }
      }
    });
  }
  console.log(`Total results found: ${results.length}`);
  dom.happyDOM?.close();
  switch (responseType) {
    case 'markdown':
      return results.map(r => `- [${r.title}](${r.url})${r.description ? `\n  ${r.description}` : ''}`).join('\n');
    case 'html':
      return results.map(r => `<div class="result"><h3><a href="${r.url}">${r.title}</a></h3>${r.description ? `<p>${r.description}</p>` : ''}</div>`).join('\n');
    case 'text':
      return results.map(r => `${r.title}\n${r.url}${r.description ? `\n${r.description}` : ''}`).join('\n\n');
    case 'json':
    default:
      return results;
  }
}
/**
 * Convert HTML to Markdown
 */
function htmlToMarkdown(html: string): string {
  // Basic HTML to Markdown conversion
  return html
    // Headers
    .replace(/<h[1-6][^>]*>(.*?)<\/h[1-6]>/gi, (_, content) => `\n# ${content.trim()}\n`)
    // Bold
    .replace(/<(strong|b)>(.*?)<\/\1>/gi, '**$2**')
    // Italic
    .replace(/<(em|i)>(.*?)<\/\1>/gi, '*$2*')
    // Links
    .replace(/<a[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, '[$2]($1)')
    // Lists
    .replace(/<(ul|ol)[^>]*>(.*?)<\/\1>/gi, (_, type, content) => {
      return content.replace(/<li[^>]*>(.*?)<\/li>/gi,
        type === 'ul' ? '- $1\n' : '1. $1\n'
      );
    })
    // Paragraphs
    .replace(/<p[^>]*>(.*?)<\/p>/gi, '\n$1\n')
    // Remove remaining HTML tags
    .replace(/<[^>]*>/g, '')
    // Fix spacing
    .replace(/\n\s*\n/g, '\n\n')
    .trim();
}
/**
 * Process response based on type
 */
async function processResponse(response: Response, responseType: 'text' | 'json' | 'html' | 'markdown', url: URL): Promise<string> {
  const contentType = response.headers.get('content-type') || '';
  // Check response size
  const contentLength = parseInt(response.headers.get('content-length') || '0');
  if (contentLength > MAX_RESPONSE_SIZE) {
    throw new Error('Response too large');
  }
  let text = await response.text();
  // Special handling for Google search results
  if (url.origin + url.pathname === GOOGLE_SEARCH_URL) {
    const mappedType = responseType === 'json' || responseType === 'markdown' ? responseType : 'json';
    const results = await extractGoogleResults(text, mappedType);
    return typeof results === 'string' ? results : JSON.stringify(results, null, 2);
  }
  switch (responseType) {
    case 'json':
      if (!contentType.includes('application/json')) {
        throw new Error('Response is not JSON');
      }
      // Pretty print JSON
      return JSON.stringify(JSON.parse(text), null, 2);
    case 'html':
      if (!contentType.includes('text/html')) {
        throw new Error('Response is not HTML');
      }
      return text;
    case 'markdown':
      if (contentType.includes('text/html')) {
        return htmlToMarkdown(text);
      } else if (contentType.includes('text/markdown')) {
        return text;
      }
      // If not HTML or Markdown, convert plain text to markdown
      return `\`\`\`\n${text}\n\`\`\``;
    case 'text':
    default:
      return text;
  }
}
/**
 * Register the URL fetcher tool with the MCP server
 */
export function registerUrlFetcherTool(server: McpServer) {
  server.tool(
    "fetch_url",
    "Fetch content from a URL with proper error handling and response processing",
    UrlFetcherSchema.shape,
    async (params) => {
      for (let attempt = 0; attempt < MAX_RETRIES; attempt++) {
        try {
          const controller = new AbortController();
          const timeout = setTimeout(() => controller.abort(), params.timeout);
          const response = await fetch(params.url.toString(), {
            signal: controller.signal,
            headers: BROWSER_HEADERS,
            redirect: 'follow'
          });
          clearTimeout(timeout);
          // Handle different status codes
          if (!response.ok) {
            if (response.status === 429) {
              if (attempt === MAX_RETRIES - 1) {
                return {
                  content: [{
                    type: "text",
                    text: "Rate limit exceeded. Please try again later."
                  }],
                  isError: true
                };
              }
              await new Promise(resolve => setTimeout(resolve, getRetryDelay(attempt)));
              continue;
            }
            return {
              content: [{
                type: "text",
                text: `HTTP ${response.status}: ${response.statusText}`
              }],
              isError: true
            };
          }
          // Process the response
          const processedContent = await processResponse(response, params.responseType, params.url);
          // Always return as text type with appropriate metadata
          return {
            content: [{
              type: "text",
              text: processedContent,
              mimeType: params.responseType === 'json' ? 'application/json' :
                params.responseType === 'markdown' ? 'text/markdown' :
                  params.responseType === 'html' ? 'text/html' : 'text/plain'
            }],
            metadata: {
              url: params.url.toString(),
              contentType: response.headers.get('content-type'),
              contentLength: response.headers.get('content-length'),
              isGoogleSearch: params.url.origin + params.url.pathname === GOOGLE_SEARCH_URL,
              responseType: params.responseType
            }
          };
        } catch (error) {
          if (error instanceof Error && error.name === 'AbortError') {
            return {
              content: [{
                type: "text",
                text: `Request timed out after ${params.timeout}ms`
              }],
              isError: true
            };
          }
          if (attempt === MAX_RETRIES - 1) {
            return {
              content: [{
                type: "text",
                text: `Failed to fetch URL: ${error instanceof Error ? error.message : 'Unknown error'}`
              }],
              isError: true
            };
          }
          await new Promise(resolve => setTimeout(resolve, getRetryDelay(attempt)));
        }
      }
      return {
        content: [{
          type: "text",
          text: "Failed to fetch URL after all retry attempts"
        }],
        isError: true
      };
    }
  );
}
export async function fetchUrl(url: string, responseType: 'text' | 'json' | 'html' | 'markdown' = 'json'): Promise<string | object[]> {
  try {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), 5000);
    const response = await fetch(url, {
      signal: controller.signal,
      headers: BROWSER_HEADERS,
      redirect: 'follow'
    });
    clearTimeout(timeout);
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }
    const html = await response.text();
    writeFileSync('fetchedPage.html', html);
    console.log(`Saved fetched HTML (${html.length} bytes) to fetchedPage.html`);
    if (url.startsWith(GOOGLE_SEARCH_URL)) {
      return extractGoogleResults(html, responseType);
    }
    switch (responseType) {
      case 'markdown':
        return htmlToMarkdown(html);
      case 'html':
        return html;
      case 'text':
        return html;
      case 'json':
      default:
        return [{ content: html }];
    }
  } catch (error) {
    if (error instanceof Error) {
      throw new Error(`Failed to fetch URL: ${error.message}`);
    }
    throw error;
  }
}
export async function fetchUrlWithParams(params: FetchUrlParams): Promise<string | object[]> {
  return await z.object({
    url: z.instanceof(URL),
    responseType: z.enum(['json', 'markdown']).default('json'),
    timeout: z.number().min(1000).max(30000).default(5000)
  }).parseAsync(params).then(
    async (params) => {
      try {
        const controller = new AbortController();
        const timeout = setTimeout(() => controller.abort(), params.timeout);
        const response = await fetch(params.url.toString(), {
          signal: controller.signal,
          headers: BROWSER_HEADERS,
          redirect: 'follow'
        });
        clearTimeout(timeout);
        if (!response.ok) {
          throw new Error(`HTTP error! status: ${response.status}`);
        }
        const html = await response.text();
        if (params.url.origin + params.url.pathname === GOOGLE_SEARCH_URL) {
          return extractGoogleResults(html, params.responseType);
        }
        return params.responseType === 'markdown' ? html : [{ content: html }];
      } catch (error) {
        if (error instanceof Error) {
          throw new Error(`Failed to fetch URL: ${error.message}`);
        }
        throw error;
      }
    }
  );
}