import { chromium, Browser } from 'playwright';
import PQueue from 'p-queue';
import { ContentParser, SiteConfig } from './parser.js';
import { normalizeUrl, isSameDomain, extractDomainName, urlToFilename, matchesPatterns } from '../utils/url.js';
import { writeFileContent, getDocumentationPath, fileExists } from '../utils/file.js';
import { DocumentationCategorizer } from '../categorizer/index.js';
import { loadConfig } from '../config/index.js';
import { Config } from '../config/types.js';
export interface CrawlOptions {
url: string;
max_depth?: number;
force_refresh?: boolean;
rate_limit?: number;
include_patterns?: string[];
exclude_patterns?: string[];
onProgress?: (status: CrawlStatus) => void;
}
export interface CrawlStatus {
discovered: number;
processed: number;
saved: number;
errors: number;
currentUrl?: string;
}
export interface CrawlResult {
success: boolean;
stats: CrawlStatus;
savedFiles: string[];
errors: string[];
category: 'tools' | 'apis';
name: string;
}
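/*
* Illustrative usage (a minimal sketch; the calling code is assumed and not
* part of this module):
*
*   const crawler = new DocumentationCrawler();
*   const result = await crawler.crawl({
*     url: 'https://example.com/docs',
*     max_depth: 2,
*     onProgress: (status) => console.error(`Crawling ${status.currentUrl ?? ''}`),
*   });
*   if (result.success) {
*     console.log(`Saved ${result.stats.saved} pages under ${result.category}/${result.name}`);
*   }
*/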
/**
* Documentation crawler: renders pages with Playwright, extracts their content,
* categorizes the site as a tool or an API, and saves each page as Markdown
* with YAML frontmatter, subject to rate limiting and depth limits.
*/
export class DocumentationCrawler {
private browser: Browser | null = null;
private parser: ContentParser;
private categorizer: DocumentationCategorizer;
private queue: PQueue;
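// Per-crawl bookkeeping; visited/discovered/errors/savedFiles are reset at the start of each crawl().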
private visited = new Set<string>();
private discovered = new Set<string>();
private errors: string[] = [];
private savedFiles: string[] = [];
private config: Config | null = null;
constructor() {
this.parser = new ContentParser();
this.categorizer = new DocumentationCategorizer();
this.queue = new PQueue({ concurrency: 1, interval: 1000, intervalCap: 2 }); // Default 2 requests per second
}
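/**
* Lazily load and cache the application config on first use.
*/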
private async getConfig(): Promise<Config> {
if (!this.config) {
this.config = await loadConfig();
}
return this.config;
}
/**
* Crawl documentation from a starting URL
*/
async crawl(options: CrawlOptions): Promise<CrawlResult> {
try {
await this.initialize();
const appConfig = await this.getConfig();
// First, fetch the homepage content for categorization
const homepageContent = await this.fetchPageContent(options.url);
// Use categorizer to determine category
const categorizationResult = await this.categorizer.categorize(options.url, homepageContent);
const category = categorizationResult.category;
// Log categorization decision for transparency
console.log(`Categorization: ${category} (confidence: ${categorizationResult.confidence.toFixed(2)})`);
console.log(`Reasons: ${categorizationResult.reasons.join('; ')}`);
// Get site config for other settings (but ignore hardcoded category)
const siteConfig = this.parser.getSiteConfig(options.url) || await this.inferSiteConfig(options.url);
const name = siteConfig.name;
// Skip the crawl if documentation already exists and a refresh was not forced
if (!options.force_refresh) {
const docPath = await getDocumentationPath(category, name);
const indexExists = await fileExists(`${docPath}/index.md`);
if (indexExists) {
return {
success: true,
stats: { discovered: 0, processed: 0, saved: 0, errors: 0 },
savedFiles: [],
errors: ['Documentation already exists. Use force_refresh to update.'],
category,
name,
};
}
}
// Setup rate limiting from config or options
const rateLimit = options.rate_limit || appConfig.crawler.defaultRateLimit;
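// p-queue starts at most `intervalCap` tasks per `interval` (ms), so this allows roughly rateLimit page loads per second, processed one at a time.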
this.queue = new PQueue({
concurrency: 1,
interval: 1000,
intervalCap: rateLimit
});
// Reset state
this.visited.clear();
this.discovered.clear();
this.errors = [];
this.savedFiles = [];
// Start crawling
await this.crawlRecursive(
options.url,
0,
options.max_depth ?? siteConfig.max_depth ?? appConfig.crawler.defaultMaxDepth,
options,
category,
name
);
const stats: CrawlStatus = {
discovered: this.discovered.size,
processed: this.visited.size,
saved: this.savedFiles.length,
errors: this.errors.length,
};
return {
success: this.errors.length === 0,
stats,
savedFiles: this.savedFiles,
errors: this.errors,
category,
name,
};
} finally {
await this.cleanup();
}
}
/**
* Initialize browser
*/
private async initialize(): Promise<void> {
if (!this.browser) {
this.browser = await chromium.launch({ headless: true });
}
}
/**
* Cleanup resources
*/
private async cleanup(): Promise<void> {
if (this.browser) {
await this.browser.close();
this.browser = null;
}
}
/**
* Recursively crawl pages
*/
private async crawlRecursive(
url: string,
depth: number,
maxDepth: number,
options: CrawlOptions,
category: 'tools' | 'apis',
name: string
): Promise<void> {
const normalizedUrl = normalizeUrl(url);
// Skip if already visited or too deep
if (this.visited.has(normalizedUrl) || depth > maxDepth) {
return;
}
// Check URL patterns
if (options.include_patterns && !matchesPatterns(url, options.include_patterns)) {
return;
}
if (options.exclude_patterns && matchesPatterns(url, options.exclude_patterns)) {
return;
}
// Only crawl same domain
if (!isSameDomain(url, options.url)) {
return;
}
this.visited.add(normalizedUrl);
try {
await this.queue.add(async () => {
options.onProgress?.({
discovered: this.discovered.size,
processed: this.visited.size,
saved: this.savedFiles.length,
errors: this.errors.length,
currentUrl: url,
});
await this.processPage(url, category, name);
});
// Only discover new links if we haven't reached max depth
if (depth < maxDepth) {
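// Link discovery navigates to the page again in a separate tab (see extractLinksFromPage), independent of processPage above.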
const links = await this.extractLinksFromPage(url);
for (const link of links) {
const normalizedLink = normalizeUrl(link);
if (!this.discovered.has(normalizedLink)) {
this.discovered.add(normalizedLink);
await this.crawlRecursive(link, depth + 1, maxDepth, options, category, name);
}
}
}
} catch (error) {
const errorMessage = `Failed to process ${url}: ${error instanceof Error ? error.message : String(error)}`;
this.errors.push(errorMessage);
console.error(errorMessage);
}
}
/**
* Process a single page
*/
private async processPage(url: string, category: 'tools' | 'apis', name: string): Promise<void> {
if (!this.browser) {
throw new Error('Browser not initialized');
}
const appConfig = await this.getConfig();
const page = await this.browser.newPage();
try {
// Set user agent
await page.setExtraHTTPHeaders({
'User-Agent': appConfig.crawler.userAgent
});
// Navigate to page
await page.goto(url, { waitUntil: 'networkidle', timeout: appConfig.crawler.pageTimeout });
// Get page content
const html = await page.content();
// Extract content and title
const title = this.parser.extractTitle(html, url);
const content = this.parser.extractContent(html, url);
if (!content.trim()) {
throw new Error('No content extracted');
}
// Generate filename
const filename = urlToFilename(url);
const docPath = await getDocumentationPath(category, name);
const filePath = `${docPath}/${filename}`;
// Create frontmatter
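// The resulting header looks like this (illustrative values):
//   ---
//   title: "Getting Started"
//   url: "https://example.com/docs/intro"
//   date: "2024-05-01T12:00:00.000Z"
//   category: "tools"
//   tool: "example"
//   ---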
const frontmatter = [
'---',
`title: "${title.replace(/"/g, '\\"')}"`,
`url: "${url}"`,
`date: "${new Date().toISOString()}"`,
`category: "${category}"`,
`tool: "${name}"`,
'---',
'',
].join('\n');
// Write file
const finalContent = frontmatter + content;
await writeFileContent(filePath, finalContent);
this.savedFiles.push(filePath);
} finally {
await page.close();
}
}
/**
* Extract links from a page
*/
private async extractLinksFromPage(url: string): Promise<string[]> {
if (!this.browser) {
return [];
}
const appConfig = await this.getConfig();
const page = await this.browser.newPage();
try {
await page.goto(url, { waitUntil: 'networkidle', timeout: appConfig.crawler.pageTimeout });
const html = await page.content();
return this.parser.extractLinks(html, url);
} catch (error) {
console.error(`Failed to extract links from ${url}:`, error);
return [];
} finally {
await page.close();
}
}
/**
* Infer site configuration from URL (category is now determined by categorizer)
*/
private async inferSiteConfig(url: string): Promise<SiteConfig> {
const appConfig = await this.getConfig();
const name = extractDomainName(url);
return {
name,
category: 'tools', // Default, but will be overridden by categorizer
max_depth: appConfig.crawler.defaultMaxDepth,
};
}
/**
* Fetch page content for analysis
*/
private async fetchPageContent(url: string): Promise<string> {
if (!this.browser) {
await this.initialize();
}
const appConfig = await this.getConfig();
const page = await this.browser!.newPage();
try {
await page.setExtraHTTPHeaders({
'User-Agent': appConfig.crawler.userAgent
});
await page.goto(url, { waitUntil: 'networkidle', timeout: appConfig.crawler.pageTimeout });
const html = await page.content();
return this.parser.extractContent(html, url);
} catch (error) {
console.error(`Failed to fetch content from ${url}:`, error);
return '';
} finally {
await page.close();
}
}
}