import { z } from 'zod';
import axios from 'axios';
import * as cheerio from 'cheerio';
import axiosRetry from 'axios-retry';
import { BaseTool } from './base.js';
import { logger } from '../utils/logger.js';
// Shared axios instance used for every page fetch. Defaults: 10 second timeout,
// at most 5 redirects, and a Chrome-on-Windows User-Agent string.
// Per-request settings passed to `httpClient.get` are applied on top of these.
const httpClient = axios.create({
  headers: {
    'User-Agent':
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  },
  maxRedirects: 5,
  timeout: 10000,
});
// Attach retry behaviour to the shared client: up to 3 retries with exponential
// back-off. A request is retried when axios-retry classifies the failure as a
// network / idempotent-request error, or when the server answered HTTP 429.
axiosRetry(httpClient, {
  retries: 3,
  retryDelay: axiosRetry.exponentialDelay,
  retryCondition: (error) =>
    error.response?.status === 429 || axiosRetry.isNetworkOrIdempotentRequestError(error),
});
/**
 * Result of fetching and parsing a page.
 *
 * Only `url` is always present; every other field is filled in only when the
 * corresponding kind of extraction was performed.
 */
export interface ExtractedContent {
  /** URL that was requested. */
  url: string;
  /** Text of the page body, one non-empty trimmed line per row. */
  text?: string;
  /** Headings found in the page: level (1-6) and trimmed text. */
  headings?: Array<{ level: number; text: string }>;
  /** Anchors with an `href`: trimmed link text and the (absolute where parseable) target. */
  links?: Array<{ text: string; href: string }>;
  /** Page title, `<meta>` name/property → content pairs, and canonical URL. */
  metadata?: Record<string, string>;
  /** Images with a `src`: the (absolute where parseable) source and alt text. */
  images?: Array<{ src: string; alt: string }>;
  /** Results of caller-supplied CSS selectors, keyed by the caller's names. */
  custom?: Record<string, string | string[]>;
}
/**
 * Schema for the optional per-request `options` object.
 *
 * - `timeout`: integer milliseconds, 1000-60000.
 * - `userAgent`: replacement User-Agent header value.
 * - `followRedirects`: defaults to `true` when an options object is supplied.
 * - `maxRedirects`: integer, 0-10.
 *
 * The whole object may be omitted.
 */
const webFetchOptionsSchema = z
  .object({
    timeout: z
      .number()
      .int()
      .min(1000)
      .max(60000)
      .optional()
      .describe('Request timeout in milliseconds'),
    userAgent: z
      .string()
      .optional()
      .describe('Custom User-Agent header'),
    followRedirects: z
      .boolean()
      .optional()
      .default(true)
      .describe('Follow HTTP redirects'),
    maxRedirects: z
      .number()
      .int()
      .min(0)
      .max(10)
      .optional()
      .describe('Maximum number of redirects'),
  })
  .optional();
/**
 * Input schema for the webfetch tool.
 *
 * - `url`: the page to fetch (must parse as a URL).
 * - `extract`: which built-in extractions to run; omitted or empty means all.
 * - `selectors`: map of result name → CSS selector for custom extraction.
 * - `options`: optional request tuning, see `webFetchOptionsSchema`.
 */
const webFetchSchema = z.object({
  url: z
    .string()
    .url()
    .describe('URL to fetch content from'),
  extract: z
    .array(z.enum(['text', 'headings', 'links', 'metadata', 'images']))
    .optional()
    .describe('Types of content to extract (default: all)'),
  selectors: z
    .record(z.string())
    .optional()
    .describe('Custom CSS selectors to extract (key: name, value: selector)'),
  options: webFetchOptionsSchema,
});

/** Parameter object accepted by the tool, inferred from `webFetchSchema`. */
type WebFetchParams = z.infer<typeof webFetchSchema>;
/**
 * WebFetchTool - Fetch an HTML page and extract structured content from it.
 *
 * The page is downloaded with the shared `httpClient`, parsed with Cheerio, and
 * then run through the requested extractors (text, headings, links, metadata,
 * images) plus any caller-supplied CSS selectors.
 *
 * All extractors are read-only with respect to the parsed document, so the
 * output of one extractor never depends on which other extractors ran.
 */
export class WebFetchTool extends BaseTool<typeof webFetchSchema> {
  readonly name = 'webfetch';
  readonly description =
    'Fetch and parse HTML content from any URL. Extract text, headings, links, metadata, images, or use custom CSS selectors. Supports timeout configuration, custom user-agent, and redirect handling.';
  readonly schema = webFetchSchema;

  /**
   * Fetch `params.url` and run the requested extractions.
   *
   * @param params - Validated tool input (URL, extraction kinds, custom selectors, request options).
   * @returns The extracted content; only the requested fields are populated.
   * @throws Error with a `Request timeout`, `HTTP <status>`, or `Network error`
   *   prefix for the corresponding axios failures, and `Failed to fetch content`
   *   for anything else (including parse failures).
   */
  protected async execute(params: WebFetchParams): Promise<ExtractedContent> {
    type ExtractKind = NonNullable<WebFetchParams['extract']>[number];

    logger.info(`Fetching content from URL`, { url: params.url });
    try {
      const options = params.options;
      const userAgent = options?.userAgent;

      // Built in one expression so the config stays fully typed (no `any`).
      const config = {
        timeout: options?.timeout ?? 10000,
        // `followRedirects: false` wins over any explicit `maxRedirects`.
        maxRedirects: options?.followRedirects === false ? 0 : (options?.maxRedirects ?? 5),
        validateStatus: (status: number) => status >= 200 && status < 400,
        // Ask for the raw body: with the default response type axios parses
        // JSON bodies into objects, which cannot be handed to the HTML parser.
        responseType: 'text' as const,
        ...(userAgent ? { headers: { 'User-Agent': userAgent } } : {}),
      };

      const response = await httpClient.get(params.url, config);

      // Defensive: guarantee a string even if the body was decoded into
      // something else.
      const payload: unknown = response.data;
      const html =
        typeof payload === 'string' ? payload : payload == null ? '' : JSON.stringify(payload);

      const $ = cheerio.load(html);

      // Relative URLs resolve against <base href> when the document declares
      // one, otherwise against the requested URL. NOTE: the post-redirect URL
      // is not taken into account.
      const baseUrl = this.resolveBaseUrl($, params.url);

      // An omitted or empty `extract` list means "extract everything".
      const requested = new Set<ExtractKind>(params.extract ?? []);
      const shouldExtract = (kind: ExtractKind): boolean =>
        requested.size === 0 || requested.has(kind);

      const result: ExtractedContent = { url: params.url };

      if (shouldExtract('text')) {
        result.text = this.extractText($);
      }
      if (shouldExtract('headings')) {
        result.headings = this.extractHeadings($);
      }
      if (shouldExtract('links')) {
        result.links = this.extractLinks($, baseUrl);
      }
      if (shouldExtract('metadata')) {
        result.metadata = this.extractMetadata($);
      }
      if (shouldExtract('images')) {
        result.images = this.extractImages($, baseUrl);
      }
      if (params.selectors) {
        result.custom = this.extractCustomSelectors($, params.selectors);
      }

      logger.info(`Successfully fetched and parsed content`, { url: params.url });
      return result;
    } catch (error) {
      if (axios.isAxiosError(error)) {
        if (error.code === 'ECONNABORTED') {
          throw new Error(`Request timeout: ${params.url}`);
        } else if (error.response) {
          throw new Error(
            `HTTP ${error.response.status}: ${error.response.statusText} - ${params.url}`
          );
        } else if (error.request) {
          throw new Error(`Network error: Unable to reach ${params.url}`);
        }
      }
      throw new Error(
        `Failed to fetch content: ${error instanceof Error ? error.message : 'Unknown error'}`
      );
    }
  }

  /**
   * Determine the URL that relative references in the document resolve against.
   *
   * @param $ - Parsed document.
   * @param pageUrl - URL the document was requested from.
   * @returns The first `<base href>` resolved against `pageUrl`, or `pageUrl`
   *   itself when there is no usable `<base>` element.
   */
  private resolveBaseUrl($: ReturnType<typeof cheerio.load>, pageUrl: string): string {
    const baseHref = $('base[href]').first().attr('href');
    if (!baseHref) {
      return pageUrl;
    }
    try {
      return new URL(baseHref, pageUrl).href;
    } catch {
      return pageUrl;
    }
  }

  /**
   * Resolve a possibly-relative reference to an absolute URL.
   *
   * @returns The absolute URL, or `raw` unchanged when it cannot be parsed.
   */
  private toAbsoluteUrl(raw: string, baseUrl: string): string {
    try {
      return new URL(raw, baseUrl).href;
    } catch {
      return raw;
    }
  }

  /**
   * Extract the body text, one non-empty trimmed line per row.
   *
   * Script, style and noscript content is excluded. The stripping is done on a
   * clone of `<body>` so the shared document is left untouched for the other
   * extractors and for custom selectors.
   */
  private extractText($: ReturnType<typeof cheerio.load>): string {
    const $body = $('body').clone();
    $body.find('script, style, noscript').remove();

    return $body
      .text()
      .split('\n')
      .map((line) => line.trim())
      .filter((line) => line.length > 0)
      .join('\n');
  }

  /**
   * Extract h1-h6 headings in document order, skipping headings with no text.
   */
  private extractHeadings($: ReturnType<typeof cheerio.load>): { level: number; text: string }[] {
    const headings: { level: number; text: string }[] = [];

    // A single combined selector yields matches in document order, which
    // preserves the page outline (a per-level loop would group by level).
    $('h1, h2, h3, h4, h5, h6').each((_, element) => {
      const $heading = $(element);
      const text = $heading.text().trim();
      // tagName is e.g. "H2"; the second character is the level digit.
      const level = Number(String($heading.prop('tagName')).charAt(1));
      if (text && Number.isInteger(level) && level >= 1 && level <= 6) {
        headings.push({ level, text });
      }
    });

    return headings;
  }

  /**
   * Extract every `<a href>` with its trimmed text and absolute target.
   *
   * @param baseUrl - URL that relative hrefs are resolved against.
   */
  private extractLinks(
    $: ReturnType<typeof cheerio.load>,
    baseUrl: string
  ): { text: string; href: string }[] {
    const links: { text: string; href: string }[] = [];

    $('a[href]').each((_, element) => {
      const $link = $(element);
      const href = $link.attr('href');
      if (href) {
        links.push({ text: $link.text().trim(), href: this.toAbsoluteUrl(href, baseUrl) });
      }
    });

    return links;
  }

  /**
   * Extract the page title, `<meta>` name/property → content pairs, and the
   * canonical URL. When several meta tags share a key, the last one wins.
   */
  private extractMetadata($: ReturnType<typeof cheerio.load>): Record<string, string> {
    const metadata: Record<string, string> = {};

    const title = $('title').text().trim();
    if (title) {
      metadata.title = title;
    }

    $('meta').each((_, element) => {
      const $meta = $(element);
      // `name` covers standard tags, `property` covers Open Graph style tags.
      const key = $meta.attr('name') || $meta.attr('property');
      const content = $meta.attr('content');
      if (key && content) {
        metadata[key] = content;
      }
    });

    const canonical = $('link[rel="canonical"]').attr('href');
    if (canonical) {
      metadata.canonical = canonical;
    }

    return metadata;
  }

  /**
   * Extract every `<img src>` with its absolute source and alt text
   * (empty string when the image has no alt attribute).
   *
   * @param baseUrl - URL that relative sources are resolved against.
   */
  private extractImages(
    $: ReturnType<typeof cheerio.load>,
    baseUrl: string
  ): { src: string; alt: string }[] {
    const images: { src: string; alt: string }[] = [];

    $('img[src]').each((_, element) => {
      const $img = $(element);
      const src = $img.attr('src');
      if (src) {
        images.push({ src: this.toAbsoluteUrl(src, baseUrl), alt: $img.attr('alt') || '' });
      }
    });

    return images;
  }

  /**
   * Evaluate caller-supplied CSS selectors.
   *
   * For each entry the result is: `''` when nothing matches or the selector is
   * invalid, the trimmed text when exactly one element matches, or an array of
   * the non-empty trimmed texts when several elements match.
   *
   * @param selectors - Map of result name → CSS selector.
   */
  private extractCustomSelectors(
    $: ReturnType<typeof cheerio.load>,
    selectors: Record<string, string>
  ): Record<string, string | string[]> {
    const results: Record<string, string | string[]> = {};

    for (const [name, selector] of Object.entries(selectors)) {
      try {
        const elements = $(selector);
        if (elements.length === 0) {
          results[name] = '';
        } else if (elements.length === 1) {
          results[name] = elements.text().trim();
        } else {
          const values: string[] = [];
          elements.each((_, element) => {
            const text = $(element).text().trim();
            if (text) {
              values.push(text);
            }
          });
          results[name] = values;
        }
      } catch (error) {
        // Best effort: a bad selector must not fail the whole fetch.
        logger.warn(`Failed to extract selector: ${selector}`, { error });
        results[name] = '';
      }
    }

    return results;
  }
}