scrape_html
Extract text, links, and images from web pages for data collection and analysis. Specify a URL and choose what content to retrieve.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| url | Yes | URL of the web page to fetch and scrape. | |
| extractText | No | Whether to extract the page's plain text. | `DEFAULTS.EXTRACT_TEXT` |
| extractLinks | No | Whether to extract links from the page. | `DEFAULTS.EXTRACT_LINKS` |
| extractImages | No | Whether to extract images from the page. | `DEFAULTS.EXTRACT_IMAGES` |
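
For illustration, here is a minimal sketch of how these arguments validate, mirroring the Zod schema shown in the implementation reference below. The boolean values standing in for the `DEFAULTS` constants are assumptions, not taken from the source:

```typescript
import { z } from "zod";

// Sketch of argument validation against the scrape_html input schema.
// The DEFAULTS.* values are assumed; the real constants live elsewhere in the repo.
const scrapeHtmlArgs = z.object({
  url: z.string().url("Valid URL is required"),
  extractText: z.boolean().optional().default(true),   // DEFAULTS.EXTRACT_TEXT (assumed)
  extractLinks: z.boolean().optional().default(true),   // DEFAULTS.EXTRACT_LINKS (assumed)
  extractImages: z.boolean().optional().default(false)  // DEFAULTS.EXTRACT_IMAGES (assumed)
});

// Only `url` is required; omitted flags are filled in with their defaults.
const args = scrapeHtmlArgs.parse({ url: "https://example.com" });
console.log(args);
// -> { url: "https://example.com", extractText: true, extractLinks: true, extractImages: false }

// An invalid URL fails validation with the message "Valid URL is required".
scrapeHtmlArgs.safeParse({ url: "not-a-url" }); // { success: false, ... }
```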
Implementation Reference
- src/tools/web-tools.ts:120-148 (handler): The main handler function. It fetches HTML from the provided URL, extracts the requested content (text, links, and/or images), and returns the results as pretty-printed JSON. Error handling is delegated to `wrapToolExecution`, which is not shown here (see the sketch after this list).

  ```typescript
  async ({ url, extractText, extractLinks, extractImages }) => {
    return wrapToolExecution(async () => {
      const response = await fetch(url);
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }
      const html = await response.text();
      const results = extractHtmlContent(html, extractText, extractLinks, extractImages);
      return {
        content: [{ type: "text" as const, text: JSON.stringify(results, null, 2) }],
        metadata: { url, extracted: { text: extractText, links: extractLinks, images: extractImages } }
      };
    }, { errorCode: ERROR_CODES.HTTP_REQUEST, context: "Failed to scrape HTML" });
  }
  ```
- src/tools/web-tools.ts:114-119 (schema): Zod input schema validating the URL and the optional extraction flags for text, links, and images.

  ```typescript
  {
    url: z.string().url("Valid URL is required"),
    extractText: z.boolean().optional().default(DEFAULTS.EXTRACT_TEXT),
    extractLinks: z.boolean().optional().default(DEFAULTS.EXTRACT_LINKS),
    extractImages: z.boolean().optional().default(DEFAULTS.EXTRACT_IMAGES)
  },
  ```
- src/tools/web-tools.ts:112-150 (registration): Registers the scrape_html tool with the MCP server, wiring the schema and handler above together.

  ```typescript
  function registerScrapeHtml(server: McpServer): void {
    server.tool(
      "scrape_html",
      {
        url: z.string().url("Valid URL is required"),
        extractText: z.boolean().optional().default(DEFAULTS.EXTRACT_TEXT),
        extractLinks: z.boolean().optional().default(DEFAULTS.EXTRACT_LINKS),
        extractImages: z.boolean().optional().default(DEFAULTS.EXTRACT_IMAGES)
      },
      async ({ url, extractText, extractLinks, extractImages }) => {
        return wrapToolExecution(async () => {
          const response = await fetch(url);
          if (!response.ok) {
            throw new Error(`HTTP ${response.status}: ${response.statusText}`);
          }
          const html = await response.text();
          const results = extractHtmlContent(html, extractText, extractLinks, extractImages);
          return {
            content: [{ type: "text" as const, text: JSON.stringify(results, null, 2) }],
            metadata: { url, extracted: { text: extractText, links: extractLinks, images: extractImages } }
          };
        }, { errorCode: ERROR_CODES.HTTP_REQUEST, context: "Failed to scrape HTML" });
      }
    );
  }
  ```
- src/tools/web-tools.ts:155-176 (helper): Helper that conditionally extracts text, links, and images from the HTML, depending on the flags, and collects them into an `HtmlExtraction` result (the type is not part of this excerpt; see the sketch after this list).

  ```typescript
  function extractHtmlContent(
    html: string,
    extractText: boolean,
    extractLinks: boolean,
    extractImages: boolean
  ): HtmlExtraction {
    const results: HtmlExtraction = {};
    if (extractText) {
      results.text = extractTextFromHtml(html);
    }
    if (extractLinks) {
      results.links = extractLinksFromHtml(html);
    }
    if (extractImages) {
      results.images = extractImagesFromHtml(html);
    }
    return results;
  }
  ```
- src/tools/web-tools.ts:181-188 (helper): Helper that extracts plain text from HTML by stripping `<script>` and `<style>` blocks, removing the remaining tags, and collapsing whitespace. The companion link and image helpers are not included in this excerpt; a hypothetical sketch follows the list.

  ```typescript
  function extractTextFromHtml(html: string): string {
    return html
      .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, "")
      .replace(/<style[^>]*>[\s\S]*?<\/style>/gi, "")
      .replace(/<[^>]+>/g, " ")
      .replace(/\s+/g, " ")
      .trim();
  }
  ```
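
The `HtmlExtraction` type and the `extractLinksFromHtml` / `extractImagesFromHtml` helpers referenced above are defined elsewhere in src/tools/web-tools.ts and are not part of this excerpt. A minimal sketch, assuming they follow the same regex-based approach as `extractTextFromHtml` and return plain URL strings, might look like this:

```typescript
// Hypothetical sketch only; the actual shapes and implementations may differ.
interface HtmlExtraction {
  text?: string;
  links?: string[];
  images?: string[];
}

// Collect href values from <a> tags (assumed implementation).
function extractLinksFromHtml(html: string): string[] {
  const links: string[] = [];
  const anchorRegex = /<a\b[^>]*\bhref\s*=\s*["']([^"']+)["']/gi;
  let match: RegExpExecArray | null;
  while ((match = anchorRegex.exec(html)) !== null) {
    links.push(match[1]);
  }
  return links;
}

// Collect src values from <img> tags (assumed implementation).
function extractImagesFromHtml(html: string): string[] {
  const images: string[] = [];
  const imgRegex = /<img\b[^>]*\bsrc\s*=\s*["']([^"']+)["']/gi;
  let match: RegExpExecArray | null;
  while ((match = imgRegex.exec(html)) !== null) {
    images.push(match[1]);
  }
  return images;
}
```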
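
`wrapToolExecution` and `ERROR_CODES` are likewise imported from elsewhere in the repository and are not shown here. Purely as an assumption about its shape, a wrapper like this would run the tool body and convert any thrown error into an error result tagged with the given error code and context:

```typescript
// Hypothetical sketch; the real wrapper and error codes are defined elsewhere in the repo.
interface ToolExecutionOptions {
  errorCode: string;
  context: string;
}

async function wrapToolExecution<T>(
  fn: () => Promise<T>,
  options: ToolExecutionOptions
): Promise<T | { content: { type: "text"; text: string }[]; isError: true }> {
  try {
    // Success path: return the tool body's result unchanged.
    return await fn();
  } catch (error) {
    // Failure path: surface the error as a text content block flagged with isError.
    const message = error instanceof Error ? error.message : String(error);
    return {
      content: [{ type: "text" as const, text: `[${options.errorCode}] ${options.context}: ${message}` }],
      isError: true
    };
  }
}
```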