Crawl4AI MCP Server

crawl4ai-mcp
src
crawlers
implementations

crawl4ai.ts•3.76 KiB

import type {
  ICrawlerConfig,
  ICrawlResult,
  IFilterOptions,
} from '../../types/crawler'
import type { BaseCrawlerOptions } from './base'
import { ECrawlImpl } from '../../constants'
import { CRAWL4AI_API_TOKEN, CRAWL4AI_URL } from '../../env'
import { CrawlerError, CrawlerErrorType } from '../../errors/crawler'
import { createRequest } from '../../request'
import { BaseCrawler } from './base'

/**
 * Crawler implementation backed by a Crawl4AI service.
 *
 * Sends the target URL to the service's `crawl` endpoint and maps the first
 * returned result onto an `ICrawlResult` (title + raw markdown).
 */
export class Crawl4AICrawler extends BaseCrawler {
  public readonly type = ECrawlImpl.Crawl4AI
  public readonly name = 'Crawl4AI'
  public readonly description
    = '基于 Crawl4AI 服务的智能网页内容提取和 Markdown 转换'

  /** HTTP client bound to the configured service URL and token. */
  private readonly request: ReturnType<typeof createRequest>
  private config: ICrawlerConfig

  constructor(options: BaseCrawlerOptions) {
    super(options)
    this.config = this.getConfig()
    this.request = createRequest({
      baseUrl: this.config.url,
      authorization: this.config.token,
    })
  }

  /**
   * Build the crawler configuration.
   *
   * @returns The service URL and API token taken from the environment module,
   *   plus the HTML tags the service is asked to exclude.
   */
  getConfig(): ICrawlerConfig {
    return {
      url: CRAWL4AI_URL,
      token: CRAWL4AI_API_TOKEN,
      excludedTags: ['script', 'style', 'noscript'],
    }
  }

  /**
   * Crawl a page and return its content.
   *
   * This implementation either resolves to a result or throws; no code path
   * resolves to `undefined` even though the declared return type permits it.
   *
   * @param url Target URL; must parse as an http(s) URL.
   * @param _options Filter options (currently unused by this implementation).
   * @returns The crawl result built from the first entry of the response.
   * @throws CrawlerError when the URL is invalid, the service health check
   *   fails, the service reports `success === false`, or the response carries
   *   no usable first result. Any other failure is rethrown through
   *   `CrawlerError.contentExtractionError`.
   */
  async crawl(
    url: string,
    _options?: IFilterOptions,
  ): Promise<ICrawlResult | undefined> {
    if (!this.validateUrl(url)) {
      throw CrawlerError.invalidUrlError(url)
    }

    try {
      // Make sure the crawler service is reachable before posting the job.
      const isAvailable = await this.isAvailable()
      if (!isAvailable) {
        throw CrawlerError.crawlerUnavailableError(this.name)
      }

      // Build the request payload.
      const requestBody = {
        urls: [url],
        crawler_config: {
          type: 'CrawlerRunConfig',
          params: {
            excluded_tags: this.config.excludedTags,
            stream: false,
            cache_mode: 'BYPASS',
          },
        },
      }

      // Call the crawl4ai service.
      const response = await this.request.post('crawl', {
        json: requestBody,
      })

      // The payload is forwarded as-is to the logger and to CrawlerError,
      // whose parameter types are not visible here, so it stays loosely typed.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const data = (await response.json()) as any
      this.log.info('fetch_web result', data)

      if (data?.success === false) {
        throw new CrawlerError(
          CrawlerErrorType.NETWORK_ERROR,
          `Crawl4AI 爬取失败: ${data.message || 'Crawl failed'}`,
          data,
        )
      }

      // Guard the shape before indexing: a null body or a payload without a
      // `results` array must land in the explicit "empty content" error below
      // rather than surface as a TypeError re-wrapped by the catch block.
      const firstResult = Array.isArray(data?.results)
        ? data.results[0]
        : undefined
      if (!firstResult) {
        throw new CrawlerError(
          CrawlerErrorType.CONTENT_EXTRACTION_ERROR,
          '获取的页面内容为空',
        )
      }

      // Build the crawl result. `||` is deliberate: an empty-string title
      // falls through to the next candidate.
      const result: ICrawlResult = {
        title:
          firstResult.metadata?.['og:title']
          || firstResult.metadata?.title
          || '',
        content: firstResult.markdown?.raw_markdown || '',
        url,
        contentType: 'text',
      }

      return result
    }
    catch (error) {
      // Errors already classified by this module pass through untouched.
      if (error instanceof CrawlerError) {
        throw error
      }

      throw CrawlerError.contentExtractionError(
        `Failed to crawl ${url}: ${
          error instanceof Error ? error.message : 'Unknown error'
        }`,
        url,
        error instanceof Error ? error : undefined,
      )
    }
  }

  /**
   * Check whether a URL is acceptable for crawling.
   *
   * @param url Target URL.
   * @returns `true` only when the string parses as a URL whose protocol is
   *   `http:` or `https:`; `false` otherwise, including on parse failure.
   */
  validateUrl(url: string): boolean {
    try {
      const urlObj = new URL(url)
      return ['http:', 'https:'].includes(urlObj.protocol)
    }
    catch {
      return false
    }
  }

  /**
   * Check whether the crawler service is available.
   *
   * Issues a GET to the `health` endpoint with a `timeout` of 5000
   * (presumably milliseconds — confirm against `createRequest`).
   *
   * @returns `response.ok` of the health request, or `false` if the request
   *   throws.
   */
  async isAvailable(): Promise<boolean> {
    try {
      const response = await this.request.get('health', {
        timeout: 5000,
      })
      return response.ok
    }
    catch {
      // Best-effort probe: any failure simply means "not available".
      return false
    }
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/gkzhb/crawl4ai-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

crawl4ai.ts•3.76 KiB