actors-mcp-server

Official · by apify · MIT License
get-html-skeleton.ts (5.16 kB)
import { z } from 'zod';
import zodToJsonSchema from 'zod-to-json-schema';

import { ApifyClient } from '../apify-client.js';
import { HelperTools, RAG_WEB_BROWSER, TOOL_MAX_OUTPUT_CHARS } from '../const.js';
import { getHtmlSkeletonCache } from '../state.js';
import type { InternalTool, ToolEntry } from '../types.js';
import { ajv } from '../utils/ajv.js';
import { isValidHttpUrl } from '../utils/generic.js';
import { stripHtml } from '../utils/html.js';
import { buildMCPResponse } from '../utils/mcp.js';

interface ScrapedPageItem {
    crawl: {
        httpStatusCode: number;
        httpStatusMessage: string;
    }
    metadata: {
        url: string;
    }
    query: string;
    html?: string;
}

const getHtmlSkeletonArgs = z.object({
    url: z.string()
        .min(1)
        .describe('URL of the webpage to retrieve HTML skeleton from.'),
    enableJavascript: z.boolean()
        .optional()
        .default(false)
        .describe('Whether to enable JavaScript rendering. Enabling this may increase the time taken to retrieve the HTML skeleton.'),
    chunk: z.number()
        .optional()
        .default(1)
        .describe('Chunk number to retrieve when getting the content. The content is split into chunks to prevent exceeding the maximum tool output length.'),
});

export const getHtmlSkeleton: ToolEntry = {
    type: 'internal',
    tool: {
        name: HelperTools.GET_HTML_SKELETON,
        actorFullName: HelperTools.GET_HTML_SKELETON,
        description: `Retrieve the HTML skeleton (clean structure) of a webpage by stripping scripts, styles, and non-essential attributes.
This keeps the core HTML structure, links, images, and data attributes for analysis.
Supports optional JavaScript rendering for dynamic pages.
The results will include a chunked HTML skeleton if the content is large. Use the chunk parameter to paginate through the output.
USAGE:
- Use when you need a clean HTML structure to design selectors or parsers for scraping.
USAGE EXAMPLES:
- user_input: Get HTML skeleton for https://example.com
- user_input: Get next chunk of HTML skeleton for https://example.com (chunk=2)`,
        inputSchema: zodToJsonSchema(getHtmlSkeletonArgs),
        ajvValidate: ajv.compile(zodToJsonSchema(getHtmlSkeletonArgs)),
        call: async (toolArgs) => {
            const { args, apifyToken } = toolArgs;
            const parsed = getHtmlSkeletonArgs.parse(args);

            if (!isValidHttpUrl(parsed.url)) {
                return buildMCPResponse([`The provided URL is not a valid HTTP or HTTPS URL: ${parsed.url}`]);
            }

            // Try to get from cache first
            let strippedHtml = getHtmlSkeletonCache.get(parsed.url);
            if (!strippedHtml) {
                // Not in cache, call the Actor for scraping
                const client = new ApifyClient({ token: apifyToken });
                const run = await client.actor(RAG_WEB_BROWSER).call({
                    query: parsed.url,
                    outputFormats: [
                        'html',
                    ],
                    scrapingTool: parsed.enableJavascript ? 'browser-playwright' : 'raw-http',
                });

                const datasetItems = await client.dataset(run.defaultDatasetId).listItems();
                if (datasetItems.items.length === 0) {
                    return buildMCPResponse([`The scraping Actor (${RAG_WEB_BROWSER}) did not return any output for the URL: ${parsed.url}. Please check the Actor run for more details: ${run.id}`]);
                }

                const firstItem = datasetItems.items[0] as unknown as ScrapedPageItem;
                if (firstItem.crawl.httpStatusMessage.toLocaleLowerCase() !== 'ok') {
                    return buildMCPResponse([`The scraping Actor (${RAG_WEB_BROWSER}) returned an HTTP status ${firstItem.crawl.httpStatusCode} (${firstItem.crawl.httpStatusMessage}) for the URL: ${parsed.url}.
Please check the Actor run for more details: ${run.id}`]);
                }

                if (!firstItem.html) {
                    return buildMCPResponse([`The scraping Actor (${RAG_WEB_BROWSER}) did not return any HTML content for the URL: ${parsed.url}. Please check the Actor run for more details: ${run.id}`]);
                }

                strippedHtml = stripHtml(firstItem.html);
                getHtmlSkeletonCache.set(parsed.url, strippedHtml);
            }

            // Pagination logic
            const totalLength = strippedHtml.length;
            const chunkSize = TOOL_MAX_OUTPUT_CHARS;
            const totalChunks = Math.ceil(totalLength / chunkSize);
            const startIndex = (parsed.chunk - 1) * chunkSize;
            const endIndex = Math.min(startIndex + chunkSize, totalLength);
            const chunkContent = strippedHtml.slice(startIndex, endIndex);
            const hasNextChunk = parsed.chunk < totalChunks;

            const chunkInfo = `\n\n--- Chunk ${parsed.chunk} of ${totalChunks} ---\n${hasNextChunk ? `Next chunk: ${parsed.chunk + 1}` : 'End of content'}`;

            return buildMCPResponse([chunkContent + chunkInfo]);
        },
    } as InternalTool,
};
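The pagination at the end of call is the one piece of non-obvious logic in the file: chunks are 1-indexed and sized by TOOL_MAX_OUTPUT_CHARS. Below is a minimal, self-contained sketch of the same arithmetic. The chunkText helper and the 25-character chunk size are illustrative inventions for this example, not part of the repository.

    // Illustrative sketch (not from the repo): the same 1-indexed chunking
    // scheme the tool uses, with a tiny chunk size so boundaries are visible.
    function chunkText(text: string, chunk: number, chunkSize: number) {
        const totalChunks = Math.ceil(text.length / chunkSize);
        const start = (chunk - 1) * chunkSize; // chunk 1 begins at index 0
        const end = Math.min(start + chunkSize, text.length);
        return {
            content: text.slice(start, end),
            totalChunks,
            hasNext: chunk < totalChunks, // mirrors hasNextChunk above
        };
    }

    const skeleton = '<html><body><main><a href="/a">a</a></main></body></html>'; // 57 chars
    console.log(chunkText(skeleton, 1, 25)); // { content: '<html><body><main><a href', totalChunks: 3, hasNext: true }
    console.log(chunkText(skeleton, 3, 25)); // final chunk '</html>', hasNext: false

An off-by-one here would silently drop or duplicate characters at chunk boundaries, which is why the real tool derives the start index from chunk - 1 rather than chunk.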

MCP directory API

We provide all the information about MCP servers via our MCP directory API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/apify/actors-mcp-server'
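If you are calling the directory API from code rather than the shell, the same request with the standard Fetch API might look like the sketch below. The response schema is not documented on this page, so the result is left untyped; treat any field access as an assumption.

    // Minimal sketch: the same GET request as the curl example above (Node 18+ or browser fetch).
    async function getServerInfo(): Promise<unknown> {
        const res = await fetch('https://glama.ai/api/mcp/v1/servers/apify/actors-mcp-server');
        if (!res.ok) throw new Error(`Request failed: ${res.status} ${res.statusText}`);
        return res.json(); // schema not documented here, so left as unknown
    }

    getServerInfo().then((server) => console.log(server));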

If you have feedback or need assistance with the MCP directory API, please join our Discord server.