import { URL } from 'url';
import { CrawlResult, DocsCrawlerType, WebCrawler } from '../types.js';
import { CrawleeCrawler, StorageState } from './crawlee-crawler.js';
import { GitHubCrawler } from './github.js';
import { logger } from '../util/logger.js';
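
/**
 * Crawls documentation sites, dispatching github.com URLs to a dedicated
 * GitHubCrawler and all other hosts to a Crawlee-based crawler.
 */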
export class DocsCrawler implements WebCrawler {
private readonly GITHUB_HOST = 'github.com';
  private readonly MIN_PAGES = 2; // A successful crawl of component-library docs should yield at least 2 pages
private isAborting = false;
private storageState?: StorageState;
private pathPrefix?: string;
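
  /**
   * @param maxDepth Depth limit forwarded to the underlying crawlers.
   * @param maxRequestsPerCrawl Request cap forwarded to the underlying crawlers.
   * @param githubToken Optional token passed through to the GitHub crawler.
   * @param onProgress Optional callback receiving progress updates from the crawlers.
   */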
constructor(
private readonly maxDepth: number = 4,
private readonly maxRequestsPerCrawl: number = 1000,
private readonly githubToken?: string,
private readonly onProgress?: (progress: number, description: string) => void
  ) {}

  /**
* Set an optional path prefix to restrict crawling to URLs under this path.
* Only pages whose path starts with this prefix will be crawled.
* Example: '/oss/javascript/langchain' would only crawl pages under that path.
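   *
   * @example
   * // Given a DocsCrawler instance `crawler`:
   * crawler.setPathPrefix('/oss/javascript/langchain');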
*/
setPathPrefix(prefix: string): void {
this.pathPrefix = prefix;
logger.info(`[DocsCrawler] Path prefix restriction set: ${prefix}`);
  }

  /**
* Set authentication storage state (cookies) to use when crawling
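   *
   * A minimal, illustrative example; the exact StorageState shape is defined
   * in crawlee-crawler.js, so the Playwright-style cookie fields shown here
   * are an assumption:
   *
   * @example
   * crawler.setStorageState({
   *   cookies: [{ name: 'session', value: 'abc123', domain: 'docs.example.com', path: '/' }],
   * });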
*/
setStorageState(state: StorageState): void {
this.storageState = state;
logger.info(`[DocsCrawler] Set storage state with ${state.cookies?.length || 0} cookies`);
}
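
  /**
   * Crawl the given URL, yielding pages as they are discovered. github.com
   * URLs are handled by GitHubCrawler; all other hosts go through Crawlee.
   * The generator's return value reports which crawler ran.
   */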
async *crawl(url: string): AsyncGenerator<CrawlResult, DocsCrawlerType> {
const startUrl = new URL(url);
logger.debug(`[DocsCrawler] Starting crawl of ${startUrl}`);
    // If abort was requested before dispatch, bail out early; 'crawlee' is
    // returned as the default crawler type since nothing ran.
    if (this.isAborting) {
      logger.debug('[DocsCrawler] Crawl aborted');
      return 'crawlee';
    }
// Handle GitHub repositories
if (startUrl.host === this.GITHUB_HOST) {
logger.debug('[DocsCrawler] Detected GitHub repository');
      const githubCrawler = new GitHubCrawler(
        this.maxDepth,
        this.maxRequestsPerCrawl,
        this.githubToken,
        this.onProgress
      );
try {
for await (const page of githubCrawler.crawl(url)) {
if (this.isAborting) break;
yield page;
}
return 'github';
} catch (e) {
logger.debug('[DocsCrawler] GitHub crawler failed:', e);
// Don't fall through to other crawlers for GitHub URLs
throw e;
}
}
// Use Crawlee for all other sites
logger.debug('[DocsCrawler] Using Crawlee crawler');
    const crawleeCrawler = new CrawleeCrawler(
      this.maxDepth,
      this.maxRequestsPerCrawl,
      this.onProgress
    );
// Pass authentication if available
if (this.storageState) {
crawleeCrawler.setStorageState(this.storageState);
}
// Pass path prefix restriction if configured
if (this.pathPrefix) {
crawleeCrawler.setPathPrefix(this.pathPrefix);
}
let pageCount = 0;
try {
for await (const page of crawleeCrawler.crawl(url)) {
if (this.isAborting) break;
pageCount++;
yield page;
}
      // An aborted crawl is not a failure; skip the MIN_PAGES check.
      if (this.isAborting) {
        logger.debug(`[DocsCrawler] Crawl aborted after ${pageCount} pages`);
        return 'crawlee';
      }
      if (pageCount >= this.MIN_PAGES) {
        logger.debug(`[DocsCrawler] Crawlee crawler successful (${pageCount} pages)`);
        return 'crawlee';
      }
      logger.debug(`[DocsCrawler] Crawlee crawler found insufficient pages (${pageCount})`);
      throw new Error(`Crawlee crawler found only ${pageCount} page(s), need at least ${this.MIN_PAGES}`);
} catch (e) {
logger.debug('[DocsCrawler] Crawlee crawler failed:', e);
throw e;
}
}
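
  /**
   * Request that the current crawl stop. The flag is checked once per
   * yielded page, so iteration exits at the next yield rather than
   * immediately.
   */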
abort(): void {
this.isAborting = true;
}
}
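
/*
 * Usage sketch (illustrative only, not part of this module). The start URL
 * and the progress-logging wiring are assumptions for the example; only the
 * DocsCrawler API above is real.
 *
 *   const crawler = new DocsCrawler(4, 1000, undefined, (progress, description) => {
 *     logger.info(`progress ${progress}: ${description}`);
 *   });
 *   for await (const page of crawler.crawl('https://docs.example.com')) {
 *     // consume each CrawlResult as it streams in
 *   }
 */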