scrape.ts
import { z } from 'zod';
import { Context, ToolParameters, UserError, Tool } from 'fastmcp';
import { getClient } from '@utils/client';
import type { CrawlRequest, PageOptions } from '@watercrawl/nodejs/dist/types';

interface ScrapeArgs {
  urls: string[];
  pageOptions?: PageOptions;
}

// Create a batch crawl request, stream its progress events, and collect page
// results until the crawl reports the 'finished' state. `args` is typed as
// `ScrapeArgs | any` to satisfy fastmcp's generic execute signature;
// ScrapeArgs documents the expected shape.
const scrapeUrl = async (args: ScrapeArgs | any, { session }: Context<any>) => {
  const client = getClient(session?.apiKey);
  try {
    const req = await client.createBatchCrawlRequest(args.urls, {}, args.pageOptions || {});
    const results = [];
    for await (const data of client.monitorCrawlRequest(req.uuid, true)) {
      if (data.type === 'result') {
        results.push(data.data);
      }
      if (data.type === 'state' && (data.data as CrawlRequest).status === 'finished') {
        break;
      }
    }
    return JSON.stringify({
      ...req,
      results,
    });
  } catch (e) {
    throw new UserError(String(e));
  }
};

// Input schema for the tool; pageOptions mirrors WaterCrawl's PageOptions.
const parameters = z.object({
  urls: z.string().array().describe('List of URLs to scrape'),
  pageOptions: z
    .object({
      exclude_tags: z.string().array().optional().describe('HTML tags to exclude'),
      include_tags: z.string().array().optional().describe('HTML tags to include'),
      wait_time: z.number().optional().describe('Time to wait for page loading in ms'),
      only_main_content: z.boolean().optional().describe('Extract only main content'),
      include_html: z.boolean().optional().describe('Include HTML in response'),
      include_links: z.boolean().optional().describe('Include links in response'),
      timeout: z.number().optional().describe('Page load timeout in ms'),
      accept_cookies_selector: z
        .string()
        .optional()
        .describe('CSS selector for accept cookies button'),
      locale: z.string().optional().describe('Locale for the page'),
      extra_headers: z.record(z.string()).optional().describe('Additional HTTP headers'),
      actions: z
        .array(
          z.object({
            type: z.enum(['pdf', 'screenshot']).describe('Action type'),
          }),
        )
        .optional()
        .describe('Actions to perform on the page'),
    })
    .optional()
    .describe('Page scraping options'),
  sync: z.boolean().optional().default(true).describe('Wait for scraping to complete'),
  download: z.boolean().optional().default(true).describe('Download content immediately'),
});

export const ScrapeTool: Tool<any, ToolParameters> = {
  name: 'scrape-urls',
  description: 'Scrape one or more URLs, with optional page-option configuration',
  parameters,
  execute: scrapeUrl,
};
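
For context, a tool object in this shape is registered on a FastMCP server instance. The sketch below shows one way ScrapeTool could be wired up, assuming fastmcp's FastMCP constructor, addTool, and start APIs; the server name, version, and the './tools/scrape' import path are illustrative placeholders, not part of the file above.

// server.ts — minimal wiring sketch (assumptions noted above)
import { FastMCP } from 'fastmcp';
import { ScrapeTool } from './tools/scrape'; // hypothetical path to the file above

const server = new FastMCP({
  name: 'watercrawl-mcp', // placeholder server name
  version: '1.0.0',       // placeholder version
});

// Register the tool so MCP clients can invoke 'scrape-urls'.
server.addTool(ScrapeTool);

// Serve over stdio, the common transport for locally spawned MCP servers.
server.start({ transportType: 'stdio' });

A client call to 'scrape-urls' then carries arguments matching the zod schema, e.g. { "urls": ["https://example.com"], "pageOptions": { "only_main_content": true } }.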

MCP directory API

We provide all the information about MCP servers via our MCP directory API. For example:

curl -X GET 'https://glama.ai/api/mcp/v1/servers/watercrawl/watercrawl-mcp'
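
The same endpoint can also be queried from code. Here is a minimal sketch in TypeScript, assuming only the standard global fetch API (and an ES-module context for top-level await); it assumes a JSON body but no particular response schema.

// Fetch this server's entry from the Glama MCP directory API.
// Sketch only: no specific response shape is assumed.
const res = await fetch('https://glama.ai/api/mcp/v1/servers/watercrawl/watercrawl-mcp');
if (!res.ok) {
  throw new Error(`Directory API request failed: ${res.status}`);
}
const info = await res.json();
console.log(JSON.stringify(info, null, 2));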

If you have feedback or need assistance with the MCP directory API, please join our Discord server.