Skip to main content
Glama
brendon92

Specialized AI Search Tools

by brendon92
webfetch.ts (10.5 kB)
import { z } from 'zod';
import axios from 'axios';
import type { AxiosRequestConfig } from 'axios';
import * as cheerio from 'cheerio';
import axiosRetry from 'axios-retry';
import { BaseTool } from './base.js';
import { logger } from '../utils/logger.js';

/** Request timeout (ms) used when the caller does not supply one. */
const DEFAULT_TIMEOUT_MS = 10000;

/** Redirect limit used when the caller does not supply one. */
const DEFAULT_MAX_REDIRECTS = 5;

/** Browser-like User-Agent sent unless the caller overrides it per request. */
const DEFAULT_USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

// Shared axios instance; per-request options passed to `get` override these defaults.
const httpClient = axios.create({
  timeout: DEFAULT_TIMEOUT_MS,
  maxRedirects: DEFAULT_MAX_REDIRECTS,
  headers: {
    'User-Agent': DEFAULT_USER_AGENT,
  },
});

// Retry up to 3 times with exponential backoff on network/idempotent-request
// errors and on HTTP 429 responses.
axiosRetry(httpClient, {
  retries: 3,
  retryDelay: axiosRetry.exponentialDelay,
  retryCondition: (error) =>
    axiosRetry.isNetworkOrIdempotentRequestError(error) || error.response?.status === 429,
});

/**
 * Extracted content interface.
 *
 * Every field except `url` is present only when the corresponding extraction
 * was requested (or when no explicit `extract` list was given).
 */
export interface ExtractedContent {
  /** The URL that was requested (as passed by the caller). */
  url: string;
  /** Whitespace-normalised body text with script/style/noscript content removed. */
  text?: string;
  /** Non-empty headings, grouped by level (all h1 first, then h2, ... h6). */
  headings?: { level: number; text: string }[];
  /** Anchors with an `href`, resolved against the requested URL when possible. */
  links?: { text: string; href: string }[];
  /** Title, `<meta name|property>` contents and canonical link. */
  metadata?: Record<string, string>;
  /** Images with a `src`, resolved against the requested URL when possible. */
  images?: { src: string; alt: string }[];
  /** Results of caller-supplied CSS selectors, keyed by the caller's names. */
  custom?: Record<string, string | string[]>;
}

/**
 * WebFetch options schema
 */
const webFetchOptionsSchema = z
  .object({
    timeout: z
      .number()
      .int()
      .min(1000)
      .max(60000)
      .optional()
      .describe('Request timeout in milliseconds'),
    userAgent: z.string().optional().describe('Custom User-Agent header'),
    followRedirects: z.boolean().optional().default(true).describe('Follow HTTP redirects'),
    maxRedirects: z
      .number()
      .int()
      .min(0)
      .max(10)
      .optional()
      .describe('Maximum number of redirects'),
  })
  .optional();

/**
 * WebFetch tool schema
 */
const webFetchSchema = z.object({
  url: z.string().url().describe('URL to fetch content from'),
  extract: z
    .array(z.enum(['text', 'headings', 'links', 'metadata', 'images']))
    .optional()
    .describe('Types of content to extract (default: all)'),
  selectors: z
    .record(z.string())
    .optional()
    .describe('Custom CSS selectors to extract (key: name, value: selector)'),
  options: webFetchOptionsSchema,
});

type WebFetchParams = z.infer<typeof webFetchSchema>;

/** One of the built-in extraction kinds accepted by the `extract` parameter. */
type ExtractKind = NonNullable<WebFetchParams['extract']>[number];

/** Parsed document handle as returned by `cheerio.load`. */
type CheerioDoc = ReturnType<typeof cheerio.load>;

/**
 * WebFetchTool - Fetch and parse HTML content from URLs
 */
export class WebFetchTool extends BaseTool<typeof webFetchSchema> {
  readonly name = 'webfetch';
  readonly description =
    'Fetch and parse HTML content from any URL. Extract text, headings, links, metadata, images, or use custom CSS selectors. Supports timeout configuration, custom user-agent, and redirect handling.';
  readonly schema = webFetchSchema;

  /**
   * Fetch `params.url` and extract the requested pieces of content.
   *
   * @param params - Parameters matching `webFetchSchema`.
   * @returns The extracted content; only requested sections are populated.
   * @throws Error with a `Request timeout`, `HTTP <status>`, `Network error` or
   *   `Failed to fetch content` message depending on the failure.
   */
  protected async execute(params: WebFetchParams): Promise<ExtractedContent> {
    logger.info('Fetching content from URL', { url: params.url });

    try {
      const response = await httpClient.get<unknown>(params.url, this.buildRequestConfig(params));

      // `responseType: 'text'` should already yield a string; coerce defensively
      // so cheerio never receives a parsed object.
      const html =
        typeof response.data === 'string' ? response.data : String(response.data ?? '');

      const $ = cheerio.load(html);

      // An absent or empty `extract` list means "extract everything".
      const requested = params.extract ?? [];
      const extractAll = requested.length === 0;
      const shouldExtract = (kind: ExtractKind): boolean =>
        extractAll || requested.includes(kind);

      const result: ExtractedContent = { url: params.url };

      if (shouldExtract('text')) {
        result.text = this.extractText($);
      }
      if (shouldExtract('headings')) {
        result.headings = this.extractHeadings($);
      }
      if (shouldExtract('links')) {
        result.links = this.extractLinks($, params.url);
      }
      if (shouldExtract('metadata')) {
        result.metadata = this.extractMetadata($);
      }
      if (shouldExtract('images')) {
        result.images = this.extractImages($, params.url);
      }

      // Custom selectors are independent of the `extract` list.
      if (params.selectors) {
        result.custom = this.extractCustomSelectors($, params.selectors);
      }

      logger.info('Successfully fetched and parsed content', { url: params.url });
      return result;
    } catch (error) {
      if (axios.isAxiosError(error)) {
        if (error.code === 'ECONNABORTED') {
          throw new Error(`Request timeout: ${params.url}`);
        } else if (error.response) {
          throw new Error(
            `HTTP ${error.response.status}: ${error.response.statusText} - ${params.url}`
          );
        } else if (error.request) {
          throw new Error(`Network error: Unable to reach ${params.url}`);
        }
      }
      throw new Error(
        `Failed to fetch content: ${error instanceof Error ? error.message : 'Unknown error'}`
      );
    }
  }

  /**
   * Build the per-request axios configuration from the caller's options.
   *
   * `followRedirects === false` forces `maxRedirects` to 0 regardless of any
   * explicit `maxRedirects` value. Status codes 200-399 are treated as success.
   */
  private buildRequestConfig(params: WebFetchParams): AxiosRequestConfig {
    const options = params.options;

    const config: AxiosRequestConfig = {
      timeout: options?.timeout ?? DEFAULT_TIMEOUT_MS,
      maxRedirects:
        options?.followRedirects === false ? 0 : (options?.maxRedirects ?? DEFAULT_MAX_REDIRECTS),
      validateStatus: (status: number) => status >= 200 && status < 400,
      // Ask for the raw body so JSON-looking responses are not parsed into objects.
      responseType: 'text',
    };

    if (options?.userAgent) {
      config.headers = { 'User-Agent': options.userAgent };
    }

    return config;
  }

  /**
   * Resolve `candidate` against `baseUrl`, returning the absolute URL, or the
   * original string unchanged when it cannot be parsed.
   */
  private resolveUrl(candidate: string, baseUrl: string): string {
    try {
      return new URL(candidate, baseUrl).href;
    } catch {
      return candidate;
    }
  }

  /**
   * Extract text content from the page.
   *
   * Script, style and noscript elements are stripped from a *clone* of the
   * body, so the shared document is left intact for the other extractors and
   * for custom selectors (which may legitimately target those elements).
   */
  private extractText($: CheerioDoc): string {
    const body = $('body').clone();
    body.find('script, style, noscript').remove();

    // Trim every line and drop the ones that end up empty.
    return body
      .text()
      .split('\n')
      .map((line) => line.trim())
      .filter((line) => line.length > 0)
      .join('\n');
  }

  /**
   * Extract headings (h1-h6).
   *
   * Results are grouped by level (all h1, then all h2, ...), not in document
   * order. Headings whose trimmed text is empty are skipped.
   */
  private extractHeadings($: CheerioDoc): { level: number; text: string }[] {
    const headings: { level: number; text: string }[] = [];

    for (let level = 1; level <= 6; level++) {
      $(`h${level}`).each((_, element) => {
        const text = $(element).text().trim();
        if (text) {
          headings.push({ level, text });
        }
      });
    }

    return headings;
  }

  /**
   * Extract links with their text and href.
   *
   * Relative hrefs are resolved against `baseUrl`; unparseable hrefs are kept
   * as-is. Anchors with an empty `href` are skipped.
   */
  private extractLinks($: CheerioDoc, baseUrl: string): { text: string; href: string }[] {
    const links: { text: string; href: string }[] = [];

    $('a[href]').each((_, element) => {
      const $link = $(element);
      const href = $link.attr('href');
      if (href) {
        links.push({ text: $link.text().trim(), href: this.resolveUrl(href, baseUrl) });
      }
    });

    return links;
  }

  /**
   * Extract metadata from meta tags and title.
   *
   * Meta tags are keyed by their `name` attribute, falling back to `property`;
   * a later tag with the same key overwrites an earlier one. The canonical link
   * is stored under `canonical` and the document title under `title`.
   */
  private extractMetadata($: CheerioDoc): Record<string, string> {
    const metadata: Record<string, string> = {};

    const title = $('title').text().trim();
    if (title) {
      metadata.title = title;
    }

    $('meta').each((_, element) => {
      const $meta = $(element);
      const name = $meta.attr('name') || $meta.attr('property');
      const content = $meta.attr('content');
      if (name && content) {
        metadata[name] = content;
      }
    });

    const canonical = $('link[rel="canonical"]').attr('href');
    if (canonical) {
      metadata.canonical = canonical;
    }

    return metadata;
  }

  /**
   * Extract images with their src and alt attributes.
   *
   * Relative `src` values are resolved against `baseUrl`; unparseable values
   * are kept as-is. A missing `alt` becomes an empty string.
   */
  private extractImages($: CheerioDoc, baseUrl: string): { src: string; alt: string }[] {
    const images: { src: string; alt: string }[] = [];

    $('img[src]').each((_, element) => {
      const $img = $(element);
      const src = $img.attr('src');
      if (src) {
        images.push({ src: this.resolveUrl(src, baseUrl), alt: $img.attr('alt') || '' });
      }
    });

    return images;
  }

  /**
   * Extract content using custom CSS selectors.
   *
   * For each named selector the result is:
   * - `''` when nothing matches or the selector throws (a warning is logged),
   * - the trimmed text when exactly one element matches,
   * - an array of the non-empty trimmed texts when several elements match.
   */
  private extractCustomSelectors(
    $: CheerioDoc,
    selectors: Record<string, string>
  ): Record<string, string | string[]> {
    const results: Record<string, string | string[]> = {};

    for (const [name, selector] of Object.entries(selectors)) {
      try {
        const elements = $(selector);

        if (elements.length === 0) {
          results[name] = '';
        } else if (elements.length === 1) {
          results[name] = elements.text().trim();
        } else {
          const values: string[] = [];
          elements.each((_, element) => {
            const text = $(element).text().trim();
            if (text) {
              values.push(text);
            }
          });
          results[name] = values;
        }
      } catch (error) {
        logger.warn(`Failed to extract selector: ${selector}`, { error });
        results[name] = '';
      }
    }

    return results;
  }
}

Implementation Reference

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/brendon92/mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.