Skip to main content
Glama
crawler.ts (23.9 kB)
/**
 * Advanced documentation crawler with rate limiting, retry logic, and progress tracking
 */

/* eslint-disable @typescript-eslint/no-explicit-any */
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
/* eslint-disable @typescript-eslint/no-unsafe-call */

import { load, CheerioAPI } from 'cheerio';
import fetch from 'node-fetch';
import { createHash } from 'crypto';
import { detectMinecraftVersion } from './sitemap.js';
import type {
  DocumentPage,
  DocumentSection,
  CodeBlock,
  CrawlerOptions,
  IndexerProgress,
} from './types.js';

/**
 * CSS selectors for elements to remove before content extraction
 * These are navigation, UI, and non-content elements
 */
const REMOVE_SELECTORS = [
  // Navigation
  'nav',
  'header',
  'footer',
  '.nav',
  '.navigation',
  '.navbar',
  '.menu',
  '.sidebar',
  '.aside',
  'aside',
  // Breadcrumbs & TOC
  '.breadcrumb',
  '.breadcrumbs',
  '.toc',
  '.table-of-contents',
  '#toc',
  // Language selectors (VitePress/VuePress patterns)
  '.VPLocalNav',
  '.VPNavBar',
  '.VPSidebar',
  '.VPFooter',
  '.VPDocFooter',
  '.VPLocalSearchBox',
  '.VPFlyout',
  '.VPNavBarMenu',
  '.VPNavBarMenuGroup',
  '.VPNavBarMenuLink',
  '.VPNavBarSocialLinks',
  '.VPNavBarTitle',
  '.VPNavBarTranslations', // Language selector!
  '.VPNavBarAppearance',
  '.VPNavScreenTranslations',
  '.VPMenu',
  '.appearance', // Theme switcher
  '.social-links',
  '[class*="language-selector"]',
  '[class*="lang-switch"]',
  '[class*="locale-"]',
  // Search UI
  '.search',
  '.search-box',
  '#search',
  '[role="search"]',
  // Edit/feedback links
  '.edit-link',
  '.page-edit',
  '.last-updated',
  '.contributors',
  '.prev-next',
  '.pager',
  // Misc UI
  '.copy-code-button',
  '.line-numbers',
  '.vp-code-group',
  'button',
  'script',
  'style',
  'noscript',
  'iframe',
];

/**
 * Text patterns to strip from extracted content
 * These catch any remaining UI text that wasn't removed by selectors
 */
const NOISE_TEXT_PATTERNS: RegExp[] = [
  // Language selector text with flags
  /🇺🇸\s*English\s*\([^)]*\)/gi,
  /🇨🇿\s*Čeština\s*\([^)]*\)/gi,
  /🇩🇪\s*Deutsch\s*\([^)]*\)/gi,
  /🇬🇷\s*Ελληνικά\s*\([^)]*\)/gi,
  /🇪🇸\s*Español\s*\([^)]*\)/gi,
  /🇫🇷\s*Français\s*\([^)]*\)/gi,
  /🇮🇹\s*Italiano\s*\([^)]*\)/gi,
  /🇯🇵\s*日本語\s*\([^)]*\)/gi,
  /🇰🇷\s*한국어\s*\([^)]*\)/gi,
  /🇵🇱\s*Polski\s*\([^)]*\)/gi,
  /🇧🇷\s*Português\s*\([^)]*\)/gi,
  /🇷🇺\s*Русский\s*\([^)]*\)/gi,
  /🇺🇦\s*Українська\s*\([^)]*\)/gi,
  /🇻🇳\s*Tiếng Việt\s*\([^)]*\)/gi,
  // Generic flag + language patterns
  /[\u{1F1E6}-\u{1F1FF}]{2}\s*[A-Za-zÀ-ÿ\u0400-\u04FF\u0370-\u03FF\u4E00-\u9FFF\uAC00-\uD7AF]+\s*\([^)]+\)/gu,
  // Long language selector sequences (catches "Search🇺🇸 English...Tiếng Việt (Việt...")
  /Search[\s\S]{0,50}?🇺🇸[\s\S]*?(?:Việt Nam|Việt\)|한국\)|日本\)|Россия\)|Brasil\)|Polska\)|Україна\))/gi,
  // Navigation text patterns
  /On this page/gi,
  /Table of Contents/gi,
  /Skip to content/gi,
  /Edit this page/gi,
  /Last updated:/gi,
  /Contributors:/gi,
  // Theme/appearance UI
  /Switch to dark theme/gi,
  /Switch to light theme/gi,
  /Toggle dark mode/gi,
  /Appearance/gi,
  // Copy button text
  /Copy code/gi,
  /Copied!/gi,
];

/** Matches a standard HTML heading tag name (h1-h6). */
const HEADING_TAG_PATTERN = /^h[1-6]$/;

/**
 * Clean text by removing UI noise patterns and normalizing whitespace.
 *
 * Line edges are trimmed with a horizontal-whitespace class on purpose: with the
 * `m` flag, `\s` also matches `\n`, so a `^\s+|\s+$` trim deletes empty lines and
 * glues the surrounding text together ("a\n\nb" becomes "ab").
 *
 * @param text - Raw text extracted from the page
 * @returns Text with noise removed, runs of spaces/tabs collapsed to one space,
 *   every line trimmed, and at most one blank line between paragraphs
 */
function cleanText(text: string): string {
  let cleaned = text;
  for (const pattern of NOISE_TEXT_PATTERNS) {
    cleaned = cleaned.replace(pattern, '');
  }

  // Normalize line endings so the per-line steps below only have to deal with "\n"
  cleaned = cleaned.replace(/\r\n?/g, '\n');
  // Collapse runs of spaces/tabs
  cleaned = cleaned.replace(/[ \t]+/g, ' ');
  // Trim each line without touching the newlines themselves
  cleaned = cleaned.replace(/^[^\S\n]+|[^\S\n]+$/gm, '');
  // Collapse blank-line runs last, after whitespace-only lines have become empty
  cleaned = cleaned.replace(/\n{3,}/g, '\n\n');

  return cleaned.trim();
}

/**
 * Coerce a numeric option to a usable positive integer count.
 *
 * @param value - Configured count (may be 0, negative, fractional, or NaN)
 * @returns `Math.floor(value)` when it is finite and >= 1, otherwise 1
 */
function atLeastOne(value: number): number {
  return Number.isFinite(value) && value >= 1 ? Math.floor(value) : 1;
}

export class DocumentCrawler {
  private readonly options: CrawlerOptions;
  private progress: IndexerProgress;
  private queue: string[] = [];
  private activeRequests = 0;
  private onProgress?: (progress: IndexerProgress) => void;

  /**
   * @param options - Overrides for the default crawler settings
   */
  constructor(options: Partial<CrawlerOptions> = {}) {
    this.options = {
      maxConcurrency: 3,
      delayMs: 1000,
      retryAttempts: 3,
      retryDelayMs: 2000,
      userAgent: 'mcmodding-mcp-indexer/0.1.0',
      timeout: 30000,
      ...options,
    };

    this.progress = {
      total: 0,
      completed: 0,
      failed: 0,
      skipped: 0,
      startTime: new Date(),
    };
  }

  /**
   * Set progress callback
   *
   * @param callback - Receives a snapshot copy of the progress after every change
   */
  setProgressCallback(callback: (progress: IndexerProgress) => void) {
    this.onProgress = callback;
  }

  /**
   * Crawl multiple URLs with concurrency control
   *
   * @param urls - Pages to crawl
   * @returns The pages that were crawled successfully, in completion order
   *   (failed pages are logged and counted in the progress, not returned)
   */
  async crawlAll(urls: string[]): Promise<DocumentPage[]> {
    this.queue = [...urls];
    this.progress.total = urls.length;
    this.progress.completed = 0;
    this.progress.failed = 0;
    this.progress.skipped = 0;
    this.progress.startTime = new Date();

    const results: DocumentPage[] = [];
    const promises: Promise<void>[] = [];

    // Always start at least one worker: a zero/negative/NaN concurrency setting
    // would otherwise return an empty result without crawling anything.
    const workerCount = atLeastOne(this.options.maxConcurrency);
    for (let i = 0; i < workerCount; i++) {
      promises.push(this.worker(results));
    }

    await Promise.all(promises);
    return results;
  }

  /**
   * Worker that processes URLs from the shared queue until it is empty
   *
   * @param results - Shared output array; successfully crawled pages are appended
   */
  private async worker(results: DocumentPage[]) {
    while (this.queue.length > 0) {
      const url = this.queue.shift();
      // Only an exhausted queue ends the worker. An empty-string URL is handed to
      // the crawler and recorded as a failure instead of silently stopping the
      // worker with the rest of the queue unprocessed.
      if (url === undefined) break;

      this.activeRequests++;
      this.progress.currentUrl = url;
      this.updateProgress();

      try {
        const doc = await this.crawlWithRetry(url);
        results.push(doc);
        this.progress.completed++;
      } catch (error) {
        console.error(`Failed to crawl ${url}:`, error);
        this.progress.failed++;
      } finally {
        this.activeRequests--;
        this.updateProgress();

        // Rate limiting delay
        if (this.queue.length > 0) {
          await this.sleep(this.options.delayMs);
        }
      }
    }
  }

  /**
   * Crawl with retry logic
   *
   * The page is always attempted at least once, even when `retryAttempts` is
   * configured as 0 or less. The wait between attempts grows linearly
   * (`retryDelayMs * attemptNumber`).
   *
   * @param url - Page to crawl
   * @returns The parsed page
   * @throws The error from the last failed attempt
   */
  private async crawlWithRetry(url: string): Promise<DocumentPage> {
    const attempts = atLeastOne(this.options.retryAttempts);
    let lastError: Error | undefined;

    for (let attempt = 1; attempt <= attempts; attempt++) {
      try {
        return await this.crawlPage(url);
      } catch (error) {
        // Anything can be thrown; normalize so callers always receive an Error
        lastError = error instanceof Error ? error : new Error(String(error));

        if (attempt < attempts) {
          console.warn(`Retry ${attempt}/${attempts} for ${url}`);
          await this.sleep(this.options.retryDelayMs * attempt);
        }
      }
    }

    throw lastError ?? new Error('Failed to crawl page after retries');
  }

  /**
   * Crawl a single documentation page
   *
   * @param url - Page to fetch
   * @returns The parsed page
   * @throws If the request fails, is aborted after `options.timeout` ms, or the
   *   response status is not OK
   */
  async crawlPage(url: string): Promise<DocumentPage> {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), this.options.timeout);

    try {
      const response = await fetch(url, {
        headers: {
          'User-Agent': this.options.userAgent,
        },
        signal: controller.signal,
      });

      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }

      const html = await response.text();
      const $ = load(html);

      return this.parsePage(url, html, $);
    } finally {
      // Runs on success and failure so the abort timer never outlives the request
      clearTimeout(timeout);
    }
  }

  /**
   * Parse HTML into structured document
   *
   * @param url - Address the page was fetched from
   * @param html - Raw response body (stored as `rawHtml` and used for the hash)
   * @param $ - Cheerio instance loaded with `html`
   */
  private parsePage(url: string, html: string, $: CheerioAPI): DocumentPage {
    // Extract title
    const title = this.extractTitle($);

    // Extract category from URL
    const category = this.extractCategory(url);

    // Extract sections
    const sections = this.extractSections($);

    // Extract all text content for searching
    const content = this.extractContent($);

    // Calculate hash for change detection
    const hash = this.calculateHash(html);

    // Extract metadata
    const metadata = {
      crawledAt: new Date(),
      tags: this.extractTags($),
      difficulty: this.detectDifficulty($, content),
    };

    // Detect loader from URL
    const loader = this.detectLoader(url);

    // Detect Minecraft version
    const minecraftVersion = detectMinecraftVersion(url, content);

    return {
      url,
      title,
      content,
      rawHtml: html,
      category,
      loader,
      minecraftVersion,
      sections,
      metadata,
      hash,
    };
  }

  /**
   * Extract page title
   *
   * @returns Trimmed text of the first selector that yields non-empty text,
   *   or 'Untitled' when none does
   */
  private extractTitle($: CheerioAPI): string {
    // Try multiple selectors
    const selectors = ['.page .sectionedit1', 'h1', 'title', '.page-title', 'header h1'];

    for (const selector of selectors) {
      const text = $(selector).first().text().trim();
      if (text) return text;
    }

    return 'Untitled';
  }

  /**
   * Extract category from URL
   *
   * Two URL shapes are recognized:
   * - versioned docs: `.../<version or "develop">/<category>/...` yields `<category>`
   * - wiki namespaces: `.../<namespace>:<page>` yields `<namespace>`
   *
   * @returns The category, or 'general' when the URL matches neither shape
   */
  private extractCategory(url: string): string {
    const match = url.match(
      /https?:\/\/[^/]+\/(?:.*\/)?(?:(?:\d+(?:\.\d+)*|develop)\/([^/]+)|([^/:\s]+):)/
    );

    // Group 1 = docs category, group 2 = wiki namespace
    return match?.[1] ?? match?.[2] ?? 'general';
  }

  /**
   * Extract structured sections from the page
   * Handles both general docs (h1-h6 tags) and wiki.fabricmc.net structure (sectionedit classes)
   *
   * UI elements are stripped from a re-parsed copy of the document, so the
   * caller's `$` is left untouched for the other extraction steps.
   */
  private extractSections($: CheerioAPI): DocumentSection[] {
    // Work on a copy so removing UI elements does not affect other extraction
    const $doc = load($.root().html() ?? '');
    for (const selector of REMOVE_SELECTORS) {
      $doc(selector).remove();
    }

    // Try multiple selectors for main content, most specific first
    const mainSelectors = [
      'main',
      '#dokuwiki__content .page.group',
      'article',
      '.markdown-body',
      '.content-container',
      '.content',
      '#content',
    ];
    let mainContent = $doc('body').first();
    for (const selector of mainSelectors) {
      const candidate = $doc(selector).first();
      if (candidate.length > 0) {
        mainContent = candidate;
        break;
      }
    }

    if (!mainContent.length) {
      return [];
    }

    // Check if this is wiki.fabricmc.net structure (sectionedit classes)
    const wikiSections = mainContent.find('[class*="sectionedit"]');
    if (wikiSections.length > 0) {
      return this.extractWikiSections($doc, wikiSections);
    }

    return this.extractHeadingSections($doc, mainContent);
  }

  /**
   * Build sections from wiki markup: an element carrying a `sectioneditN` class
   * followed (as a later sibling) by a `<div class="levelN">` holding the content.
   *
   * @param $ - Cheerio instance the elements belong to
   * @param wikiSections - Elements whose class contains "sectionedit"
   */
  private extractWikiSections($: CheerioAPI, wikiSections: any): DocumentSection[] {
    const sections: DocumentSection[] = [];

    wikiSections.each((index: number, elem: unknown) => {
      const $heading = $(elem as any);
      const heading = cleanText($heading.text().trim());

      // Find the next sibling whose class starts with "level"
      let $contentDiv = $heading.next();
      while ($contentDiv.length > 0 && !($contentDiv.attr('class') || '').startsWith('level')) {
        $contentDiv = $contentDiv.next();
      }

      if (!$contentDiv.length) {
        return; // Skip if no content div found
      }

      // The heading depth is carried by the content div ("level2" -> 2). The
      // number in "sectioneditN" is a running section counter in DokuWiki, so it
      // is only used as a fallback when the div's class has no digits.
      const divLevel = /^level(\d+)/.exec($contentDiv.attr('class') || '');
      const editLevel = /sectionedit(\d+)/.exec($heading.attr('class') || '');
      const level = parseInt(divLevel?.[1] ?? editLevel?.[1] ?? '1', 10);

      // Extract content and code blocks from the level div
      const content: string[] = [];
      const codeBlocks: CodeBlock[] = [];

      $contentDiv.children().each((_: number, child: unknown) => {
        const $child = $(child as any);
        const childTagName = (child as { tagName?: string }).tagName?.toLowerCase();

        // An element is either treated as code or as text, never both
        if (childTagName === 'pre' || $child.find('pre').length > 0) {
          codeBlocks.push(...this.extractCodeBlocks($, $child));
        } else {
          const text = cleanText($child.text().trim());
          if (text) {
            content.push(text);
          }
        }
      });

      // Only add section if it has meaningful content
      const sectionContent = content.join('\n\n');
      if (heading || sectionContent || codeBlocks.length > 0) {
        sections.push({
          heading,
          level,
          content: sectionContent,
          codeBlocks,
          order: index,
        });
      }
    });

    return sections;
  }

  /**
   * Build sections from standard h1-h6 headings: each heading owns the sibling
   * elements that follow it up to the next heading sibling.
   *
   * @param $ - Cheerio instance the elements belong to
   * @param mainContent - Root element of the page's main content
   * @returns One section per heading, or a single "Content" section holding
   *   everything when the page has no headings
   */
  private extractHeadingSections($: CheerioAPI, mainContent: any): DocumentSection[] {
    const sections: DocumentSection[] = [];
    const headings = mainContent.find('h1, h2, h3, h4, h5, h6');

    if (headings.length === 0) {
      // No headings found, create a single section with all content
      const allText = cleanText(mainContent.text().trim());
      if (allText) {
        sections.push({
          heading: 'Content',
          level: 1,
          content: allText,
          codeBlocks: this.extractCodeBlocks($, mainContent),
          order: 0,
        });
      }
      return sections;
    }

    // Process each heading and extract content until next heading
    headings.each((index: number, elem: unknown) => {
      const $heading = $(elem as any);
      const heading = cleanText($heading.text().trim());
      const tagName = (elem as { tagName?: string }).tagName?.toLowerCase() || 'h2';
      const level = parseInt(tagName[1] || '2', 10);

      const content: string[] = [];
      const codeBlocks: CodeBlock[] = [];

      // Walk the siblings after this heading until the next heading
      let $current = $heading.next();
      while ($current.length > 0) {
        const currentTagName = ($current.get(0) as any)?.tagName?.toLowerCase();

        // Stop if we hit another heading
        if (currentTagName && HEADING_TAG_PATTERN.test(currentTagName)) {
          break;
        }

        // An element is either treated as code or as text, never both
        if (currentTagName === 'pre' || $current.find('pre').length > 0) {
          codeBlocks.push(...this.extractCodeBlocks($, $current));
        } else {
          const text = cleanText($current.text().trim());
          if (text) {
            content.push(text);
          }
        }

        $current = $current.next();
      }

      // Only add section if it has meaningful content
      const sectionContent = content.join('\n\n');
      if (heading || sectionContent || codeBlocks.length > 0) {
        sections.push({
          heading,
          level,
          content: sectionContent,
          codeBlocks,
          order: index,
        });
      }
    });

    return sections;
  }

  /**
   * Extract code blocks from an element
   * Handles multiple wiki formats:
   * 1. Standard: <pre><code>...</code></pre>
   * 2. Wiki numbered: <pre class="code java"><ol><li><div>...</div></li></ol></pre>
   * 3. Wiki direct: <pre class="code java">...code...</pre>
   *
   * @param $ - CheerioAPI instance for element manipulation
   * @param $elem - The element to extract code blocks from (can be a <pre> itself or a container)
   * @returns One entry per non-empty <pre>, with language 'text' when none is detected
   */
  private extractCodeBlocks($: CheerioAPI, $elem: any): CodeBlock[] {
    const blocks: CodeBlock[] = [];

    // Collect all <pre> elements to process.
    // .find() only searches descendants, so $elem itself is checked separately.
    const preList: any[] = [];
    if ($elem.is('pre')) {
      preList.push($elem);
    }
    $elem.find('pre').each((_: number, el: any) => {
      preList.push($(el));
    });

    for (const pre of preList) {
      const codeElem = pre.find('code');
      const olElement = pre.find('ol');

      // Extract code text - handle different wiki structures
      let code: string;
      if (olElement.length > 0) {
        // Numbered list: take each <li> as one line to preserve line structure
        const lines: string[] = [];
        olElement.find('li').each((_: number, li: any) => {
          lines.push($(li).text());
        });
        code = lines.join('\n').trim();
      } else if (codeElem.length > 0) {
        // Standard structure: <pre><code>...</code></pre>
        code = codeElem.text().trim();
      } else {
        // Direct content in pre: <pre class="code java">...code...</pre>
        code = pre.text().trim();
      }

      if (!code) continue;

      // Detect language from class attributes; the <code> class wins over <pre>
      const codeClass = codeElem.attr('class') || '';
      const preClass = pre.attr('class') || '';
      const className = codeClass || preClass;

      // Patterns, in priority order:
      // 1. "language-java" (standard markdown/prism format)
      // 2. "code java" (wiki format)
      // 3. "java" (simple format)
      let language = 'text';
      const langMatch =
        className.match(/language-(\w+)/) ||
        className.match(/\bcode\s+(\w+)/) ||
        className.match(/^(\w+)$/);

      if (langMatch && langMatch[1]) {
        language = langMatch[1];
        // Don't use "code" as a language - fall back to text
        if (language === 'code') {
          language = 'text';
        }
      }

      // Look for caption in parent figure element
      const figure = pre.closest('figure');
      const caption =
        figure.length > 0 ? figure.find('figcaption').text().trim() || undefined : undefined;

      blocks.push({
        language,
        code,
        caption,
      });
    }

    return blocks;
  }

  /**
   * Extract clean text content
   *
   * @returns Cleaned text of the main content area, taken from a copy of the
   *   document with UI/navigation elements removed
   */
  private extractContent($: CheerioAPI): string {
    // Clone the document to avoid modifying original
    const $clone = $.root().clone();
    const $doc = load($clone.html() || '');

    // Remove all UI/navigation elements
    for (const selector of REMOVE_SELECTORS) {
      $doc(selector).remove();
    }

    // Find main content area
    const main =
      $doc('main').first().length > 0
        ? $doc('main').first()
        : $doc('article').first().length > 0
          ? $doc('article').first()
          : $doc('.content, .markdown-body, #content').first().length > 0
            ? $doc('.content, .markdown-body, #content').first()
            : $doc('#dokuwiki__content .page.group').first().length > 0
              ? $doc('#dokuwiki__content .page.group').first()
              : $doc('body').first();

    // Get text and clean it
    return cleanText(main.text());
  }

  /**
   * Extract tags/keywords from the page
   *
   * @returns De-duplicated, trimmed, non-empty tags from `<meta name="keywords">`
   *   and `data-tags` attributes
   */
  private extractTags($: CheerioAPI): string[] {
    const tags: string[] = [];

    // Split a comma-separated list, dropping blanks left by stray/trailing commas
    const addTags = (list: string | undefined): void => {
      if (!list) return;
      for (const raw of list.split(',')) {
        const tag = raw.trim();
        if (tag) tags.push(tag);
      }
    };

    // From meta tags
    $('meta[name="keywords"]').each((_: number, elem: unknown) => {
      addTags($(elem as any).attr('content'));
    });

    // From data attributes
    $('[data-tags]').each((_: number, elem: unknown) => {
      addTags($(elem as any).attr('data-tags'));
    });

    return [...new Set(tags)];
  }

  /**
   * Detect difficulty level from content
   *
   * Keyword checks run in order, so 'getting started'/'introduction' win over
   * 'advanced'/'performance', which win over 'tutorial'.
   *
   * @param _$ - Unused
   * @param content - Cleaned page text
   * @returns The detected level, or undefined when no keyword is present
   */
  private detectDifficulty(
    _$: CheerioAPI,
    content: string
  ): 'beginner' | 'intermediate' | 'advanced' | undefined {
    const lower = content.toLowerCase();

    if (lower.includes('getting started') || lower.includes('introduction')) {
      return 'beginner';
    }
    if (lower.includes('advanced') || lower.includes('performance')) {
      return 'advanced';
    }
    if (lower.includes('tutorial')) {
      return 'beginner';
    }

    return undefined;
  }

  /**
   * Calculate content hash for change detection
   *
   * @returns First 16 hex characters of the SHA-256 digest of `content`
   */
  private calculateHash(content: string): string {
    return createHash('sha256').update(content).digest('hex').substring(0, 16);
  }

  /**
   * Update the estimated time remaining (in seconds) and call the callback
   *
   * The estimate is only set when it can actually be computed: before the first
   * page completes there is no rate to extrapolate from, so the field is removed
   * rather than reported as Infinity/NaN.
   */
  private updateProgress() {
    if (!this.onProgress) return;

    const { total, completed, failed, startTime } = this.progress;
    const elapsedSeconds = (Date.now() - startTime.getTime()) / 1000;
    const remaining = total - completed - failed;

    if (remaining <= 0) {
      this.progress.estimatedTimeRemaining = 0;
    } else if (completed > 0 && elapsedSeconds > 0) {
      const pagesPerSecond = completed / elapsedSeconds;
      this.progress.estimatedTimeRemaining = remaining / pagesPerSecond;
    } else {
      delete this.progress.estimatedTimeRemaining;
    }

    // Hand out a copy so the callback cannot mutate internal state
    this.onProgress({ ...this.progress });
  }

  /**
   * Detect loader from URL
   *
   * @returns 'fabric' or 'neoforge' based on the hostname, else on a
   *   `/fabric/` or `/neoforge/` path segment, else 'shared'
   */
  private detectLoader(url: string): 'fabric' | 'neoforge' | 'shared' {
    try {
      // hostname (unlike host) excludes the port
      const { hostname } = new URL(url);

      // Check for fabricmc.net or its subdomains
      if (hostname === 'fabricmc.net' || hostname.endsWith('.fabricmc.net')) {
        return 'fabric';
      }
      if (hostname === 'neoforged.net' || hostname.endsWith('.neoforged.net')) {
        return 'neoforge';
      }
    } catch {
      // Parsing failed (possibly a relative URL): fall through to path-based detection
    }

    if (url.includes('/fabric/')) {
      return 'fabric';
    }
    if (url.includes('/neoforge/')) {
      return 'neoforge';
    }

    return 'shared';
  }

  /**
   * Sleep utility
   *
   * @param ms - Delay in milliseconds
   */
  private sleep(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }
}

/**
 * Get all Fabric documentation URLs to crawl
 *
 * @returns Absolute URLs of the known pages under https://docs.fabricmc.net/develop
 */
export function getFabricDocumentationUrls(): string[] {
  // TODO: Implement sitemap parsing or recursive discovery
  // For now, return a comprehensive list of known pages
  const baseUrl = 'https://docs.fabricmc.net/develop';

  const paths = [
    // Getting Started
    '',
    'getting-started/introduction',
    'getting-started/setting-up-a-development-environment',
    'getting-started/creating-a-project',
    'getting-started/project-structure',
    'getting-started/launch-game',

    // Items
    'items/first-item',
    'items/custom-item-groups',
    'items/custom-item-interactions',
    'items/food',
    'items/tools',
    'items/armor',
    'items/custom-armor',

    // Blocks
    'blocks/first-block',
    'blocks/block-state',
    'blocks/block-entity',
    'blocks/block-entity-renderer',
    'blocks/mining-levels',

    // Entities
    'entities/effects',
    'entities/damage-types',

    // Rendering
    'rendering/basic-concepts',
    'rendering/draw-context',
    'rendering/hud',

    // Networking
    'networking/payload',
    'networking/channels',

    // Data Generation
    'data-generation/',
    'data-generation/recipes',
    'data-generation/loot-tables',
    'data-generation/tags',
    'data-generation/advancement',
    'data-generation/model',

    // Misc
    'commands/basics',
    'commands/arguments',
    'sounds/using-sounds',
    'sounds/custom',
    'codecs',

    // Add more as needed
  ];

  return paths.map((path) => `${baseUrl}/${path}`);
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/OGMatrix/mcmodding-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.