read_url
Extract and convert web page content into clean markdown format for easy readability. Analyze text, bypass paywalls, or retrieve structured data from URLs, including links and images, using this tool.
Instructions
Extract and convert web page content to clean, readable markdown format. Perfect for reading articles, documentation, blog posts, or any web content. Use this when you need to analyze text content from websites, bypass paywalls, or get structured data. 💡 Tip: Use parallel_read_url if you need to read multiple web pages simultaneously.
Input Schema
Table | JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | The complete URL of the webpage or PDF file to read and convert (e.g., 'https://example.com/article') | |
| withAllImages | No | Set to true to extract and return all images found on the page as structured data | |
| withAllLinks | No | Set to true to extract and return all hyperlinks found on the page as structured data | |
Implementation Reference
// src/tools/jina-tools.ts:180-246 (handler)
// The handler function that executes the read_url tool logic. It handles both single URLs and arrays of URLs (by calling parallel helper), invokes the core read utility, formats results as YAML, applies token guardrails, and handles errors.

/**
 * Handler for the `read_url` tool.
 *
 * - A string, or an array holding exactly one URL, is read with
 *   `readUrlFromConfig`; an error result becomes an error response, a
 *   successful one is YAML-stringified into a single text item.
 * - An array of two or more URLs is de-duplicated (first occurrence wins) and
 *   passed to `executeParallelUrlReads`; each result becomes its own text
 *   item, failures being rendered as `Error reading <url>: <message>`.
 * - Any other input (for example an empty array) yields an
 *   "Invalid URL format" error response.
 *
 * Successful responses are passed through `applyTokenGuardrail` before being
 * returned. Exceptions thrown anywhere in the body are converted into an
 * error response rather than propagated.
 */
async ({ url, withAllLinks, withAllImages }: { url: string | string[]; withAllLinks?: boolean; withAllImages?: boolean }) => {
  try {
    const props = getProps();

    // Resolve the optional flags once so both code paths send identical options.
    const readOptions = {
      withAllLinks: withAllLinks ?? false,
      withAllImages: withAllImages ?? false,
    };

    // Handle single URL or single-element array
    if (typeof url === 'string' || (Array.isArray(url) && url.length === 1)) {
      const singleUrl = typeof url === 'string' ? url : url[0];

      // Import the utility function
      const { readUrlFromConfig } = await import("../utils/read.js");

      // Use the shared utility function
      const result = await readUrlFromConfig({ url: singleUrl, ...readOptions }, props.bearerToken);

      if ('error' in result) {
        return createErrorResponse(result.error);
      }

      return applyTokenGuardrail({
        content: [{
          type: "text" as const,
          text: yamlStringify(result.structuredData),
        }],
      }, props.bearerToken, getClientName());
    }

    // Handle multiple URLs with parallel reading
    if (Array.isArray(url) && url.length > 1) {
      // A Set iterates in insertion order, so duplicates are dropped while the
      // first occurrence of each URL keeps its position in the request.
      const uniqueUrls = Array.from(new Set(url), (u) => ({ url: u, ...readOptions }));

      // Import the utility functions
      const { executeParallelUrlReads } = await import("../utils/read.js");

      // NOTE(review): the third argument is presumably a timeout in
      // milliseconds; confirm against executeParallelUrlReads.
      const results = await executeParallelUrlReads(uniqueUrls, props.bearerToken, 30000);

      // Format results for consistent output
      const contentItems: Array<{ type: 'text'; text: string }> = [];

      for (const result of results) {
        if ('success' in result && result.success) {
          contentItems.push({
            type: "text" as const,
            text: yamlStringify(result.structuredData),
          });
        } else if ('error' in result) {
          contentItems.push({
            type: "text" as const,
            text: `Error reading ${result.url}: ${result.error}`,
          });
        }
      }

      return applyTokenGuardrail({
        content: contentItems,
      }, props.bearerToken, getClientName());
    }

    return createErrorResponse("Invalid URL format");
  } catch (error) {
    return createErrorResponse(`Error: ${error instanceof Error ? error.message : String(error)}`);
  }
}
- src/tools/jina-tools.ts:175-178 (schema)Zod input schema for the read_url tool, validating url (single string URL or array of URLs), optional withAllLinks and withAllImages booleans.{ url: z.union([z.string().url(), z.array(z.string().url())]).describe("The complete URL of the webpage or PDF file to read and convert (e.g., 'https://example.com/article'). Can be a single URL string or an array of URLs for parallel reading."), withAllLinks: z.boolean().optional().describe("Set to true to extract and return all hyperlinks found on the page as structured data"), withAllImages: z.boolean().optional().describe("Set to true to extract and return all images found on the page as structured data")
// src/tools/jina-tools.ts:172-247 (registration)
// Registration of the 'read_url' tool on the MCP server using server.tool(name, description, inputSchema, handler). This is called within registerJinaTools which is invoked from src/index.ts.

// Registers `read_url`: name, human-readable description, Zod input schema,
// then the async handler that performs the read.
server.tool(
  "read_url",
  "Extract and convert web page content to clean, readable markdown format. Perfect for reading articles, documentation, blog posts, or any web content. Use this when you need to analyze text content from websites, bypass paywalls, or get structured data.",
  {
    // Either one URL or a list of URLs; every entry must pass Zod's URL check.
    url: z.union([z.string().url(), z.array(z.string().url())]).describe("The complete URL of the webpage or PDF file to read and convert (e.g., 'https://example.com/article'). Can be a single URL string or an array of URLs for parallel reading."),
    withAllLinks: z.boolean().optional().describe("Set to true to extract and return all hyperlinks found on the page as structured data"),
    withAllImages: z.boolean().optional().describe("Set to true to extract and return all images found on the page as structured data")
  },
  /**
   * Tool handler.
   *
   * - A string, or an array holding exactly one URL, is read with
   *   `readUrlFromConfig`.
   * - An array of two or more URLs is de-duplicated (first occurrence wins)
   *   and passed to `executeParallelUrlReads`; each result becomes its own
   *   text item, failures being rendered as `Error reading <url>: <message>`.
   * - Any other input (for example an empty array) yields an
   *   "Invalid URL format" error response.
   *
   * Exceptions thrown in the body are converted into an error response.
   */
  async ({ url, withAllLinks, withAllImages }: { url: string | string[]; withAllLinks?: boolean; withAllImages?: boolean }) => {
    try {
      const props = getProps();

      // Resolve the optional flags once so both code paths send identical options.
      const readOptions = {
        withAllLinks: withAllLinks ?? false,
        withAllImages: withAllImages ?? false,
      };

      // Handle single URL or single-element array
      if (typeof url === 'string' || (Array.isArray(url) && url.length === 1)) {
        const singleUrl = typeof url === 'string' ? url : url[0];

        // Import the utility function
        const { readUrlFromConfig } = await import("../utils/read.js");

        // Use the shared utility function
        const result = await readUrlFromConfig({ url: singleUrl, ...readOptions }, props.bearerToken);

        if ('error' in result) {
          return createErrorResponse(result.error);
        }

        return applyTokenGuardrail({
          content: [{
            type: "text" as const,
            text: yamlStringify(result.structuredData),
          }],
        }, props.bearerToken, getClientName());
      }

      // Handle multiple URLs with parallel reading
      if (Array.isArray(url) && url.length > 1) {
        // A Set iterates in insertion order, so duplicates are dropped while
        // the first occurrence of each URL keeps its position in the request.
        const uniqueUrls = Array.from(new Set(url), (u) => ({ url: u, ...readOptions }));

        // Import the utility functions
        const { executeParallelUrlReads } = await import("../utils/read.js");

        // NOTE(review): the third argument is presumably a timeout in
        // milliseconds; confirm against executeParallelUrlReads.
        const results = await executeParallelUrlReads(uniqueUrls, props.bearerToken, 30000);

        // Format results for consistent output
        const contentItems: Array<{ type: 'text'; text: string }> = [];

        for (const result of results) {
          if ('success' in result && result.success) {
            contentItems.push({
              type: "text" as const,
              text: yamlStringify(result.structuredData),
            });
          } else if ('error' in result) {
            contentItems.push({
              type: "text" as const,
              text: `Error reading ${result.url}: ${result.error}`,
            });
          }
        }

        return applyTokenGuardrail({
          content: contentItems,
        }, props.bearerToken, getClientName());
      }

      return createErrorResponse("Invalid URL format");
    } catch (error) {
      return createErrorResponse(`Error: ${error instanceof Error ? error.message : String(error)}`);
    }
  },
);
// src/utils/read.ts:35-114 (helper)
// The primary helper utility readUrlFromConfig that performs the actual web content extraction by calling the r.jina.ai API, handling headers for auth/links/images, normalizing URLs, structuring response data including title, content, links, images.

/**
 * Reads one URL through the `https://r.jina.ai/` endpoint and reshapes the
 * JSON reply into the structured result handed back to callers.
 *
 * Failures are reported through the return value rather than thrown: an
 * unusable URL, a non-2xx HTTP status, a reply without a `data` object, and
 * any exception raised along the way all produce a `ReadUrlError` carrying
 * the caller's original (un-normalized) URL.
 *
 * @param urlConfig - URL to read plus the optional link/image summary flags.
 * @param bearerToken - When provided, sent as an `Authorization: Bearer` header.
 * @returns A `ReadUrlResult` whose `structuredData` holds `url`, `title`,
 *   optionally `links` and `images`, and finally `content` (in that key
 *   order); otherwise a `ReadUrlError`.
 */
export async function readUrlFromConfig(
  urlConfig: ReadUrlConfig,
  bearerToken?: string
): Promise<ReadUrlResponse> {
  // Only the fields this function reads; anything else in the reply is ignored.
  type ReaderPage = {
    url?: unknown;
    title?: unknown;
    content?: unknown;
    links?: unknown;
    images?: unknown;
  };

  try {
    // Normalize the URL first
    const normalizedUrl = normalizeUrl(urlConfig.url);
    if (!normalizedUrl) {
      return { error: "Invalid or unsupported URL", url: urlConfig.url };
    }

    const wantsLinks = urlConfig.withAllLinks ?? false;
    const wantsImages = urlConfig.withAllImages ?? false;

    const headers: Record<string, string> = {
      'Accept': 'application/json',
      'Content-Type': 'application/json',
      'X-Md-Link-Style': 'discarded',
    };

    // Add Authorization header if bearer token is available
    if (bearerToken) {
      headers['Authorization'] = `Bearer ${bearerToken}`;
    }

    if (wantsLinks) {
      headers['X-With-Links-Summary'] = 'all';
    }

    if (wantsImages) {
      headers['X-With-Images-Summary'] = 'true';
    } else {
      headers['X-Retain-Images'] = 'none';
    }

    const response = await fetch('https://r.jina.ai/', {
      method: 'POST',
      headers,
      body: JSON.stringify({ url: normalizedUrl }),
    });

    if (!response.ok) {
      return { error: `HTTP ${response.status}: ${response.statusText}`, url: urlConfig.url };
    }

    // The body is external input: treat it as possibly null / missing `data`
    // so a malformed reply yields the intended message, not a TypeError text.
    const payload = (await response.json()) as { data?: ReaderPage | null } | null;
    const page = payload?.data;
    if (!page) {
      return { error: "Invalid response data from r.jina.ai", url: urlConfig.url };
    }

    // Prepare structured data. Keys are added in a fixed order because
    // callers serialize this object as-is.
    const structuredData: Record<string, unknown> = {
      url: page.url,
      title: page.title,
    };

    // Links are mapped only when they arrive as a list of
    // [anchorText, url] pairs. Any other shape is skipped so the page
    // content that was fetched successfully is still returned.
    if (wantsLinks && Array.isArray(page.links)) {
      structuredData.links = page.links.map((link: [string, string]) => ({
        anchorText: link[0],
        url: link[1],
      }));
    }

    if (wantsImages && page.images) {
      structuredData.images = page.images;
    }

    structuredData.content = page.content || "";

    return {
      success: true,
      url: urlConfig.url,
      structuredData,
      withAllLinks: wantsLinks,
      withAllImages: wantsImages,
    };
  } catch (error) {
    return {
      error: error instanceof Error ? error.message : String(error),
      url: urlConfig.url,
    };
  }
}
// src/utils/read.ts:7-26 (schema)
// TypeScript interfaces defining input config (ReadUrlConfig), result (ReadUrlResult), error (ReadUrlError), and union response type for the read_url helper functions.

/** Input describing one URL to read and which optional summaries to request. */
export interface ReadUrlConfig {
  /** The URL to read. */
  url: string;
  /** Request the link summary; treated as `false` when omitted. */
  withAllLinks?: boolean;
  /** Request the image summary; treated as `false` when omitted. */
  withAllImages?: boolean;
}

/** Shape returned when a read succeeds. */
export interface ReadUrlResult {
  success: boolean;
  /** The URL as originally supplied by the caller. */
  url: string;
  /** Extracted page data; left untyped here. */
  structuredData: any;
  /** Echo of the link-summary flag used for this read. */
  withAllLinks: boolean;
  /** Echo of the image-summary flag used for this read. */
  withAllImages: boolean;
}

/** Shape returned when a read fails. */
export interface ReadUrlError {
  /** Human-readable failure message. */
  error: string;
  /** The URL as originally supplied by the caller. */
  url: string;
}

/** Outcome of a read: distinguish the two cases with `'error' in value`. */
export type ReadUrlResponse = ReadUrlResult | ReadUrlError;