index-docs.ts (11.6 kB)
#!/usr/bin/env tsx
/* eslint-disable no-console */
/* eslint-disable @typescript-eslint/no-non-null-assertion */
/* eslint-disable no-undef */

/**
 * Advanced documentation indexing script with full feature support
 * - Sitemap-based URL discovery
 * - Semantic search embeddings
 * - Version-aware indexing
 * - Incremental updates
 */

import { mkdir } from 'fs/promises';
import { join, dirname } from 'path';
import { fileURLToPath } from 'url';
import os from 'os';

import { DocumentCrawler, getFabricDocumentationUrls } from '../src/indexer/crawler.js';
import { DocumentChunker } from '../src/indexer/chunker.js';
import { DocumentStore } from '../src/indexer/store.js';
import {
  getFabricUrlsFromSitemap,
  getFabricWikiUrlsFromSitemap,
  getNeoforgeUrlsFromSitemap,
} from '../src/indexer/sitemap.js';
import { EmbeddingGenerator } from '../src/indexer/embeddings.js';

const __dirname = dirname(fileURLToPath(import.meta.url));

interface IndexOptions {
  force?: boolean;
  incremental?: boolean;
  useSitemap?: boolean;
  generateEmbeddings?: boolean;
  embeddingsBatchSize?: number;
}

/**
 * Calculate optimal batch size based on system resources
 */
function calculateOptimalBatchSize(): number {
  const totalMem = os.totalmem();
  const freeMem = os.freemem();
  const cpus = os.cpus().length;

  console.log('🖥️ System Resources:');
  console.log(` • CPU Cores: ${cpus}`);
  console.log(` • Total Memory: ${(totalMem / 1024 / 1024 / 1024).toFixed(2)} GB`);
  console.log(` • Free Memory: ${(freeMem / 1024 / 1024 / 1024).toFixed(2)} GB`);

  // Base batch size
  let batchSize = 20;

  // Adjust based on memory (conservative: 100MB per batch of embeddings approx)
  // If we have > 8GB free, we can go big
  if (freeMem > 8 * 1024 * 1024 * 1024) {
    batchSize = 100;
  } else if (freeMem > 4 * 1024 * 1024 * 1024) {
    batchSize = 50;
  } else if (freeMem > 2 * 1024 * 1024 * 1024) {
    batchSize = 30;
  } else {
    batchSize = 10; // Low memory mode
  }

  // Adjust based on CPU (more cores = can handle more parallel processing if we were parallelizing)
  // For embeddings, batch size also affects inference speed.
  // MiniLM is small, so we can increase batch size on better CPUs.
  if (cpus >= 16) {
    batchSize = Math.min(batchSize * 2, 200);
  } else if (cpus >= 8) {
    batchSize = Math.min(batchSize * 1.5, 150);
  }

  console.log(`⚡ Optimized Batch Size: ${Math.floor(batchSize)}`);
  return Math.floor(batchSize);
}
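// Worked example: on a machine with 16+ cores and more than 8 GB of free memory,
// the memory tier above selects 100 and the CPU tier doubles it to the 200 cap,
// so embeddings are generated 200 chunks at a time.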
async function main(options: IndexOptions = {}) {
  console.log('🚀 Starting Advanced Documentation Indexing...\n');

  // Ensure data directory exists
  const dataDir = join(__dirname, '..', 'data');
  await mkdir(dataDir, { recursive: true });

  const dbPath = join(dataDir, 'mcmodding-docs.db');
  const store = new DocumentStore(dbPath);

  try {
    // Get URLs to crawl
    let urls: string[];
    if (options.useSitemap) {
      console.log('📡 Fetching URLs from sitemap...');
      urls = [];
      urls.push(...(await getFabricWikiUrlsFromSitemap()));
      urls.push(...(await getFabricUrlsFromSitemap()));
      urls.push(...(await getNeoforgeUrlsFromSitemap()));

      if (urls.length === 0) {
        console.log('⚠️ Sitemap fetch failed, falling back to static list');
        urls = getFabricDocumentationUrls();
      } else {
        console.log(`✅ Fetched ${urls.length} URLs from sitemap`);
      }
    } else {
      urls = getFabricDocumentationUrls();
    }

    console.log(`📋 Found ${urls.length} documentation pages to index\n`);

    // Initialize crawler with progress tracking
    const crawler = new DocumentCrawler({
      maxConcurrency: 3,
      delayMs: 1000,
      retryAttempts: 3,
    });

    crawler.setProgressCallback((progress) => {
      const percent = Math.round((progress.completed / progress.total) * 100);
      const eta = progress.estimatedTimeRemaining
        ? ` | ETA: ${Math.round(progress.estimatedTimeRemaining)}s`
        : '';
      process.stdout.write(
        `\r⏳ Progress: ${progress.completed}/${progress.total} (${percent}%) | Failed: ${progress.failed}${eta} `
      );
    });

    // Crawl all pages
    console.log('🕷️ Crawling documentation...');
    const documents = await crawler.crawlAll(urls);
    console.log(`\n✅ Successfully crawled ${documents.length} pages\n`);

    // Initialize chunker
    const chunker = new DocumentChunker({
      maxChunkSize: 1000,
      overlapSize: 100,
      preserveCodeBlocks: true,
    });

    // Process and store documents
    console.log('💾 Storing documents and creating search indexes...');
    let processedCount = 0;
    let updatedCount = 0;
    let skippedCount = 0;
    let totalChunksToEmbed = 0;

    // Track chunks for embedding in batches
    let pendingChunks: Array<{ id: string; content: string; documentId: number }> = [];
    let embeddingGen: EmbeddingGenerator | null = null;

    // Initialize embedding generator early if needed
    if (options.generateEmbeddings) {
      embeddingGen = new EmbeddingGenerator();
      await embeddingGen.initialize();
    }

    // Calculate optimal batch size if not provided
    const embeddingBatchSize = options.embeddingsBatchSize || calculateOptimalBatchSize();

    // Helper function to process pending embeddings
    async function processEmbeddingBatch() {
      if (!options.generateEmbeddings || pendingChunks.length === 0 || !embeddingGen) {
        return;
      }

      const batch = pendingChunks;
      pendingChunks = []; // Clear for next batch

      const batchTexts = batch.map((c) => c.content);

      // Pass the optimized batch size to the generator
      const batchEmbeddings = await embeddingGen.generateEmbeddings(batchTexts, embeddingBatchSize);

      const embeddings: Array<{ chunkId: string; embedding: number[] }> = [];
      for (let j = 0; j < batch.length; j++) {
        embeddings.push({
          chunkId: batch[j]!.id,
          embedding: batchEmbeddings[j]!,
        });
      }

      // Store embeddings immediately to free memory
      store.storeEmbeddings(embeddings, 'Xenova/all-MiniLM-L6-v2');

      // Clear arrays explicitly
      batchTexts.length = 0;
      embeddings.length = 0;

      // Force garbage collection hint aggressively
      if (global.gc) {
        global.gc();
      }

      // Yield to event loop
      await new Promise((resolve) => setImmediate(resolve));
    }
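    // Note: global.gc is only defined when Node is launched with --expose-gc
    // (e.g. NODE_OPTIONS=--expose-gc); otherwise the GC hint in
    // processEmbeddingBatch() is silently skipped.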
    for (const doc of documents) {
      try {
        // Check if document needs updating (incremental mode)
        if (options.incremental && !options.force) {
          if (!store.needsUpdate(doc.url, doc.hash)) {
            skippedCount++;
            continue;
          }
        }

        // Store document
        const documentId = store.storeDocument(doc);

        // Create and store chunks
        const chunks = chunker.chunkDocument(doc);
        store.storeChunks(chunks, documentId);

        // Collect chunks for embedding generation (but process in small batches)
        if (options.generateEmbeddings) {
          for (const chunk of chunks) {
            pendingChunks.push({
              id: chunk.id,
              content: chunk.content,
              documentId,
            });
            totalChunksToEmbed++;

            // Process embedding batch when size threshold reached
            if (pendingChunks.length >= embeddingBatchSize) {
              await processEmbeddingBatch();
            }
          }
        }

        updatedCount++;
        processedCount++;

        // Progress indicator
        process.stdout.write(
          `\r Processed: ${processedCount}/${documents.length} | Updated: ${updatedCount} | Skipped: ${skippedCount} `
        );
      } catch (error) {
        console.error(`\n❌ Error processing ${doc.url}:`, error);
      }
    }

    // Process remaining chunks
    if (pendingChunks.length > 0) {
      await processEmbeddingBatch();
    }

    console.log('\n');

    // Log embedding completion
    if (options.generateEmbeddings && totalChunksToEmbed > 0) {
      console.log(`✅ Embeddings generated and stored for ${totalChunksToEmbed} chunks\n`);
    }

    // Update timestamp
    store.updateTimestamp();

    // Show statistics
    console.log('📊 Indexing Statistics:');
    const stats = store.getStats();
    console.log(` • Total Documents: ${stats.totalDocuments}`);
    console.log(` • Total Sections: ${stats.totalSections}`);
    console.log(` • Total Code Blocks: ${stats.totalCodeBlocks}`);
    console.log(` • Fabric Docs: ${stats.loaders.fabric}`);
    console.log(` • NeoForge Docs: ${stats.loaders.neoforge}`);
    console.log(` • Shared Docs: ${stats.loaders.shared}`);

    // Show version breakdown
    const versions = store.getAllVersions();
    if (versions.length > 0) {
      console.log(` • Minecraft Versions: ${versions.join(', ')}`);
    }

    // Show embedding stats
    if (options.generateEmbeddings) {
      const embStats = store.getEmbeddingStats();
      console.log(` • Total Embeddings: ${embStats.totalEmbeddings}`);
      if (embStats.models.length > 0) {
        console.log(
          ` • Embedding Models: ${embStats.models.map((m) => `${m.model} (${m.count})`).join(', ')}`
        );
      }
    }

    console.log(` • Last Updated: ${stats.lastUpdated.toISOString()}`);
    console.log(` • Index Version: ${stats.version}`);

    console.log(`\n✨ Indexing complete!`);
    console.log(` Updated: ${updatedCount} documents`);
    console.log(` Skipped: ${skippedCount} documents (no changes)`);
    console.log(` Database: ${dbPath}\n`);
  } catch (error) {
    console.error('\n💥 Indexing failed:', error);
    process.exit(1);
  } finally {
    store.close();
  }
}

// Parse command line arguments
const args = process.argv.slice(2);
const options: IndexOptions = {
  force: args.includes('--force') || args.includes('-f'),
  incremental: args.includes('--incremental') || args.includes('-i'),
  useSitemap: args.includes('--sitemap') || args.includes('-s'),
  generateEmbeddings: args.includes('--embeddings') || args.includes('-e'),
  embeddingsBatchSize: 100,
};
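// Note: embeddingsBatchSize is always set to 100 here, so the
// calculateOptimalBatchSize() fallback in main() only takes effect when
// main() is called with options that omit this field.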
// Show help
if (args.includes('--help') || args.includes('-h')) {
  console.log('Usage: npm run index-docs [options]');
  console.log('');
  console.log('Options:');
  console.log('  -f, --force          Force full re-index (ignore hashes)');
  console.log('  -i, --incremental    Incremental update (skip unchanged)');
  console.log('  -s, --sitemap        Fetch URLs from sitemap.xml');
  console.log('  -e, --embeddings     Generate semantic embeddings');
  console.log('  -h, --help           Show this help message');
  console.log('');
  console.log('Examples:');
  console.log('  npm run index-docs                      # Standard index');
  console.log('  npm run index-docs -- --incremental     # Update only changed pages');
  console.log('  npm run index-docs -- --sitemap         # Use sitemap for URLs');
  console.log('  npm run index-docs -- --embeddings      # Generate embeddings');
  console.log('  npm run index-docs -- -i -s -e          # All features');
  process.exit(0);
}

// Run indexer
console.log('Configuration:');
console.log(` • Force re-index: ${options.force ? 'Yes' : 'No'}`);
console.log(` • Incremental: ${options.incremental ? 'Yes' : 'No'}`);
console.log(` • Use sitemap: ${options.useSitemap ? 'Yes' : 'No'}`);
console.log(` • Generate embeddings: ${options.generateEmbeddings ? 'Yes' : 'No'}`);
console.log('');

main(options).catch((error) => {
  console.error('Fatal error:', error);
  process.exit(1);
});
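For orientation, the script touches only a small surface of the ../src/indexer modules it imports. The sketch below is inferred purely from the call sites in index-docs.ts, not copied from the actual src/indexer sources, so the real exported type names and exact signatures may differ.

// Hypothetical contract sketch, inferred from how index-docs.ts uses its imports.
// The authoritative definitions live in src/indexer/*.ts; treat everything here as an assumption.

interface CrawledDocLike {
  url: string;
  hash: string; // content hash checked by store.needsUpdate()
  // ...plus whatever DocumentChunker and DocumentStore need (title, body, loader, version, ...)
}

interface DocumentCrawlerLike {
  setProgressCallback(
    cb: (p: { completed: number; total: number; failed: number; estimatedTimeRemaining?: number }) => void
  ): void;
  crawlAll(urls: string[]): Promise<CrawledDocLike[]>;
}

interface DocumentChunkerLike {
  chunkDocument(doc: CrawledDocLike): Array<{ id: string; content: string }>;
}

interface EmbeddingGeneratorLike {
  initialize(): Promise<void>;
  // One vector per input text; the second argument controls inference batching.
  generateEmbeddings(texts: string[], batchSize: number): Promise<number[][]>;
}

interface DocumentStoreLike {
  needsUpdate(url: string, hash: string): boolean;
  storeDocument(doc: CrawledDocLike): number; // returns the documentId used for chunks
  storeChunks(chunks: Array<{ id: string; content: string }>, documentId: number): void;
  storeEmbeddings(rows: Array<{ chunkId: string; embedding: number[] }>, model: string): void;
  updateTimestamp(): void;
  close(): void;
}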
