web_url_read
Retrieve and extract content from a web page URL. Options allow focusing on specific sections, paragraphs, headings, or character ranges.
Instructions
Read the content from a URL. Use this to retrieve further information and understand the content of each URL.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | URL | |
| startChar | No | Starting character position for content extraction (default: 0) | |
| maxLength | No | Maximum number of characters to return | |
| section | No | Extract content under a specific heading (searches for heading text) | |
| paragraphRange | No | Return specific paragraph ranges (e.g., '1-5', '3', '10-') | |
| readHeadings | No | Return only a list of headings instead of full content | |
Implementation Reference
- src/types.ts:70-72 (registration) Defines the 'web_url_read' tool with its name, description, and input schema (url required, with optional pagination options: startChar, maxLength, section, paragraphRange, readHeadings).
}; export const READ_URL_TOOL: Tool = { - src/url-reader.ts:202-340 (handler)Core handler function that fetches a URL, validates it (format, security policy for private IPs/hostnames), converts HTML to Markdown using NodeHtmlMarkdown, applies pagination options (character range, section, paragraph range, headings only), and caches results.
/**
 * Fetches a URL, converts the returned HTML to Markdown and applies the
 * requested pagination options to the result.
 *
 * Flow:
 * 1. A hit in `urlCache` is paginated and returned immediately (before any
 *    URL validation takes place).
 * 2. Otherwise the URL is parsed with `new URL` and passed to `assertUrlAllowed`.
 * 3. The page is fetched with an abort signal that fires after `timeoutMs`,
 *    using the proxy agent for `ProxyType.URL_READER` or, failing that, the
 *    default agent, plus an optional `User-Agent` header taken from
 *    `URL_READER_USER_AGENT` / `USER_AGENT`.
 * 4. The body is converted with `NodeHtmlMarkdown.translate`; a non-empty
 *    conversion is cached and then paginated.
 *
 * @param mcpServer Server instance handed to `logMessage`.
 * @param url Address to fetch; also used as the cache key.
 * @param timeoutMs Milliseconds before the request is aborted (default 10000).
 * @param paginationOptions Options forwarded to `applyPaginationOptions`.
 * @returns The paginated Markdown, or the value of `createEmptyContentWarning`
 *   when the conversion produced no content (that case is not cached).
 * @throws The errors built by `createURLFormatError`, `createNetworkError`,
 *   `createServerError`, `createContentError`, `createConversionError`,
 *   `createTimeoutError` or `createUnexpectedError`, depending on the stage
 *   that failed; whatever `assertUrlAllowed` throws propagates unchanged.
 */
export async function fetchAndConvertToMarkdown(
  mcpServer: McpServer,
  url: string,
  timeoutMs: number = 10000,
  paginationOptions: PaginationOptions = {}
) {
  const startedAt = Date.now();
  logMessage(mcpServer, "info", `Fetching URL: ${url}`);

  // Fast path: reuse a previously converted page.
  const cacheHit = urlCache.get(url);
  if (cacheHit) {
    logMessage(mcpServer, "info", `Using cached content for URL: ${url}`);
    const paged = applyPaginationOptions(cacheHit.markdownContent, paginationOptions);
    const elapsed = Date.now() - startedAt;
    logMessage(mcpServer, "info", `Processed cached URL: ${url} (${paged.length} chars in ${elapsed}ms)`);
    return paged;
  }

  // Reject strings that are not parseable URLs before touching the network.
  let target: URL;
  try {
    target = new URL(url);
  } catch {
    logMessage(mcpServer, "error", `Invalid URL format: ${url}`);
    throw createURLFormatError(url);
  }

  assertUrlAllowed(target);

  // The timer aborts the in-flight request once timeoutMs has elapsed.
  const aborter = new AbortController();
  const abortTimer = setTimeout(() => aborter.abort(), timeoutMs);

  try {
    const init: RequestInit = { signal: aborter.signal };

    // Proxy agent when one is configured, otherwise the default agent
    // (includes system CA certs for TLS).
    const agent = createProxyAgent(url, ProxyType.URL_READER) ?? createDefaultAgent();
    if (agent) {
      (init as any).dispatcher = agent;
    }

    // URL_READER_USER_AGENT takes priority over USER_AGENT.
    const configuredAgentString = process.env.URL_READER_USER_AGENT || process.env.USER_AGENT;
    if (configuredAgentString) {
      // No headers have been set on init up to this point.
      init.headers = { 'User-Agent': configuredAgentString };
    }

    let reply: Response;
    try {
      reply = await fetch(url, init);
    } catch (fetchFailure: any) {
      // NOTE(review): every fetch rejection is wrapped here, so whether an
      // abort still reaches the "AbortError" branch below depends on what
      // createNetworkError returns - confirm against its implementation.
      const context: ErrorContext = { url, proxyAgent: Boolean(agent), timeout: timeoutMs };
      throw createNetworkError(fetchFailure, context);
    }

    if (!reply.ok) {
      // Best effort: the body is only used to enrich the error.
      let errorBody: string;
      try {
        errorBody = await reply.text();
      } catch {
        errorBody = '[Could not read response body]';
      }
      const context: ErrorContext = { url };
      throw createServerError(reply.status, reply.statusText, errorBody, context);
    }

    let html: string;
    try {
      html = await reply.text();
    } catch (readFailure: any) {
      const reason = readFailure.message || 'Unknown error reading content';
      throw createContentError(`Failed to read website content: ${reason}`, url);
    }

    if (!html || html.trim() === "") {
      throw createContentError("Website returned empty content.", url);
    }

    let markdown: string;
    try {
      markdown = NodeHtmlMarkdown.translate(html);
    } catch (conversionFailure: any) {
      throw createConversionError(conversionFailure, url, html);
    }

    if (!markdown || markdown.trim() === "") {
      logMessage(mcpServer, "warning", `Empty content after conversion: ${url}`);
      // Empty conversions are deliberately not cached.
      return createEmptyContentWarning(url, html.length, html);
    }

    // Only successful conversions are stored.
    urlCache.set(url, html, markdown);

    const paged = applyPaginationOptions(markdown, paginationOptions);
    const elapsed = Date.now() - startedAt;
    logMessage(mcpServer, "info", `Successfully fetched and converted URL: ${url} (${paged.length} chars in ${elapsed}ms)`);
    return paged;
  } catch (failure: any) {
    switch (failure.name) {
      case "AbortError":
        logMessage(mcpServer, "error", `Timeout fetching URL: ${url} (${timeoutMs}ms)`);
        throw createTimeoutError(timeoutMs, url);

      case 'MCPSearXNGError':
        // Already one of our enhanced errors: log and pass it through.
        logMessage(mcpServer, "error", `Error fetching URL: ${url} - ${failure.message}`);
        throw failure;

      default: {
        logMessage(mcpServer, "error", `Unexpected error fetching URL: ${url}`, failure);
        const context: ErrorContext = { url };
        throw createUnexpectedError(failure, context);
      }
    }
  } finally {
    // Always release the timer, whichever way the request ended.
    clearTimeout(abortTimer);
  }
}
// - src/index.ts:29-70 (schema)Type guard validating the runtime shape of arguments for web_url_read, ensuring url is a string and optional parameters have correct types.
/**
 * Runtime type guard for the arguments of the `web_url_read` tool.
 *
 * `url` must be a string. The optional fields are accepted when absent or of
 * the right type: `startChar` a number >= 0, `maxLength` a number >= 1,
 * `section` and `paragraphRange` strings, `readHeadings` a boolean.
 *
 * Side effect: empty-string `section` / `paragraphRange` values are rewritten
 * to `undefined` on the passed object, so callers reading those fields after
 * the check see them as "not provided".
 *
 * `NaN` is rejected for both numeric fields. It has type "number" and every
 * ordering comparison against it is false, so a plain `< 0` / `< 1` range
 * check would let it through.
 *
 * @param args Untrusted value received from the tool call.
 * @returns `true` when `args` has the expected shape, `false` otherwise.
 */
export function isWebUrlReadArgs(args: unknown): args is {
  url: string;
  startChar?: number;
  maxLength?: number;
  section?: string;
  paragraphRange?: string;
  readHeadings?: boolean;
} {
  if (typeof args !== "object" || args === null) {
    return false;
  }

  const candidate = args as Record<string, unknown>;

  // A missing `url` reads as undefined and fails the same check.
  if (typeof candidate.url !== "string") {
    return false;
  }

  // Convert empty strings to undefined for optional string parameters.
  if (candidate.section === "") candidate.section = undefined;
  if (candidate.paragraphRange === "") candidate.paragraphRange = undefined;

  const { startChar, maxLength, section, paragraphRange, readHeadings } = candidate;

  if (
    startChar !== undefined &&
    (typeof startChar !== "number" || Number.isNaN(startChar) || startChar < 0)
  ) {
    return false;
  }

  if (
    maxLength !== undefined &&
    (typeof maxLength !== "number" || Number.isNaN(maxLength) || maxLength < 1)
  ) {
    return false;
  }

  if (section !== undefined && typeof section !== "string") {
    return false;
  }

  if (paragraphRange !== undefined && typeof paragraphRange !== "string") {
    return false;
  }

  if (readHeadings !== undefined && typeof readHeadings !== "boolean") {
    return false;
  }

  return true;
}
// - src/index.ts:129-151 (registration)Registration of the web_url_read tool handler in the CallToolRequestSchema handler, dispatching to fetchAndConvertToMarkdown with pagination options.
} else if (name === "web_url_read") { if (!isWebUrlReadArgs(args)) { throw new Error("Invalid arguments for URL reading"); } const paginationOptions = { startChar: args.startChar, maxLength: args.maxLength, section: args.section, paragraphRange: args.paragraphRange, readHeadings: args.readHeadings, }; const result = await fetchAndConvertToMarkdown(mcpServer, args.url, 10000, paginationOptions); return { content: [ { type: "text", text: result, }, ], }; - src/url-reader.ts:170-200 (helper)Helper function that applies pagination options to markdown content: optionally extracts only headings, filters by section heading, filters by paragraph range, and applies character-based startChar/maxLength slicing.
/**
 * Narrows converted Markdown according to the caller's pagination options.
 *
 * Order of application:
 * 1. `readHeadings` - short-circuits and returns only `extractHeadings(...)`;
 *    every other option is ignored in that case.
 * 2. `section` - keeps the output of `extractSection`.
 * 3. `paragraphRange` - keeps the output of `extractParagraphRange`, applied
 *    to whatever step 2 left.
 * 4. `startChar` / `maxLength` - character slicing via
 *    `applyCharacterPagination`, applied last.
 *
 * @param markdownContent Full Markdown text to narrow down.
 * @param options Pagination options; falsy or undefined fields are skipped.
 * @returns The narrowed text, or an explanatory message when the section or
 *   paragraph step produced an empty string.
 */
function applyPaginationOptions(markdownContent: string, options: PaginationOptions): string {
  const { readHeadings, section, paragraphRange, startChar, maxLength } = options;

  // Headings-only mode wins over every other option.
  if (readHeadings) {
    return extractHeadings(markdownContent);
  }

  let narrowed = markdownContent;

  if (section) {
    narrowed = extractSection(narrowed, section);
    if (narrowed === "") {
      return `Section "${section}" not found in the content.`;
    }
  }

  if (paragraphRange) {
    narrowed = extractParagraphRange(narrowed, paragraphRange);
    if (narrowed === "") {
      return `Paragraph range "${paragraphRange}" is invalid or out of bounds.`;
    }
  }

  // Character slicing runs last and only when a bound was supplied.
  const hasCharacterWindow = !(startChar === undefined && maxLength === undefined);
  return hasCharacterWindow
    ? applyCharacterPagination(narrowed, startChar, maxLength)
    : narrowed;
}