// Documentation scraper with recursive chunking
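// Pipeline: crawl pages with CheerioCrawler, parse each page's HTML into
// header-based sections, split the sections into token-bounded chunks, embed
// the chunks, and store chunks plus per-document metadata in the vector DB.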
import { CheerioCrawler } from 'crawlee';
import { log } from 'apify';
import * as cheerio from 'cheerio';
import { v4 as uuidv4 } from 'uuid';
import { DocChunk, DocMetadata, ScraperOptions, ScrapedPage, ParsedSection } from './types.js';
import { generateEmbeddings, estimateTokens } from './embeddings.js';
import { getVectorDB } from './vectordb.js';
// Chunking configuration
const MIN_CHUNK_TOKENS = 100;
const TARGET_CHUNK_TOKENS = 600;
const MAX_CHUNK_TOKENS = 1000;
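// Token counts come from estimateTokens() (see embeddings.js), treated here as
// an approximate heuristic rather than an exact tokenizer count.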
/**
* Parse HTML content into structured sections based on headers
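 *
 * Illustrative example: HTML like `<h1>Guide</h1><p>Hello</p><h2>Setup</h2><p>Steps</p>`
 * parses to roughly:
 *   [{ level: 1, heading: 'Guide', content: 'Hello',
 *      children: [{ level: 2, heading: 'Setup', content: 'Steps', children: [] }] }]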
*/
function parseHtmlToSections($: cheerio.CheerioAPI): ParsedSection[] {
const sections: ParsedSection[] = [];
    // Prefer the main content container, falling back to <body>. A combined
    // 'article, main, .content, body' selector with .first() would return
    // <body> (the first match in document order), so try each candidate in turn.
    const contentSelectors = ['article', 'main', '.content', 'body'];
    const content = contentSelectors
        .map((selector) => $(selector).first())
        .find((candidate) => candidate.length > 0) ?? $('body');
// Get all headers and content elements
const elements: Array<{ type: 'header' | 'content'; level?: number; text: string }> = [];
content.find('h1, h2, h3, h4, h5, h6, p, ul, ol, pre, blockquote, table').each((_, el) => {
        const $el = $(el);
        // Skip nodes nested inside containers that are captured whole below
        // (e.g. a <p> inside a <blockquote> or <li>), to avoid duplicating text
        if ($el.parents('pre, blockquote, table, ul, ol').length > 0) return;
        const tagName = el.tagName.toLowerCase();
if (tagName.match(/^h[1-6]$/)) {
const level = parseInt(tagName[1], 10);
elements.push({ type: 'header', level, text: $el.text().trim() });
} else {
// For code blocks, preserve formatting
let text: string;
if (tagName === 'pre') {
text = $el.text().trim();
if (text) {
text = '```\n' + text + '\n```';
}
} else if (tagName === 'table') {
// Simple table to text conversion
const rows: string[] = [];
$el.find('tr').each((_, row) => {
const cells: string[] = [];
$(row).find('th, td').each((_, cell) => {
cells.push($(cell).text().trim());
});
rows.push(cells.join(' | '));
});
text = rows.join('\n');
} else {
text = $el.text().trim();
}
if (text) {
elements.push({ type: 'content', text });
}
}
});
// Build hierarchical section tree
    function buildSectionTree(startIdx: number, parentLevel: number): { sections: ParsedSection[]; endIdx: number } {
        const result: ParsedSection[] = [];
        const introBuffer: string[] = [];
        let i = startIdx;
        while (i < elements.length) {
            const el = elements[i];
            if (el.type === 'header' && el.level !== undefined) {
                // A header at or above the parent level ends this nesting level
                if (el.level <= parentLevel) {
                    break;
                }
                // Create new section
                const section: ParsedSection = {
                    level: el.level,
                    heading: el.text,
                    content: '',
                    children: [],
                };
                result.push(section);
                i++;
                // The section's own content is everything up to the next header
                const contentBuffer: string[] = [];
                while (i < elements.length && elements[i].type === 'content') {
                    contentBuffer.push(elements[i].text);
                    i++;
                }
                section.content = contentBuffer.join('\n\n');
                // Deeper headers that follow become this section's children
                const childResult = buildSectionTree(i, el.level);
                section.children = childResult.sections;
                i = childResult.endIdx;
            } else {
                // Content before any header - collect it for an intro section
                introBuffer.push(el.text);
                i++;
            }
        }
        // Content that appeared before the first header becomes an intro section
        if (introBuffer.length > 0) {
            result.unshift({
                level: 0,
                heading: 'Introduction',
                content: introBuffer.join('\n\n'),
                children: [],
            });
        }
        return { sections: result, endIdx: i };
    }
return buildSectionTree(0, 0).sections;
}
/**
* Flatten sections into chunks with proper context
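 *
 * Each chunk carries a breadcrumb-style section_path (e.g. ['Guide', 'Setup'])
 * along with the document title and URL, so retrieval results can be traced
 * back to the exact place in the source page.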
*/
function sectionsToChunks(
sections: ParsedSection[],
docId: string,
docTitle: string,
docUrl: string,
parentPath: string[] = [],
): DocChunk[] {
const chunks: DocChunk[] = [];
let chunkIndex = 0;
function processSection(section: ParsedSection, path: string[]): void {
const currentPath = section.heading ? [...path, section.heading] : path;
const content = section.content.trim();
if (content) {
const tokens = estimateTokens(content);
// If content is too large, split by paragraphs
if (tokens > MAX_CHUNK_TOKENS) {
const paragraphs = content.split(/\n\n+/);
let buffer = '';
let bufferTokens = 0;
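                // Greedily pack paragraphs until adding the next one would push
                // the buffer past TARGET_CHUNK_TOKENS, then emit it as a chunk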
for (const para of paragraphs) {
const paraTokens = estimateTokens(para);
if (bufferTokens + paraTokens > TARGET_CHUNK_TOKENS && buffer) {
// Emit current buffer as chunk
chunks.push({
id: uuidv4(),
doc_id: docId,
doc_title: docTitle,
doc_url: docUrl,
section_path: currentPath,
heading: section.heading || currentPath[currentPath.length - 1] || 'Content',
content: buffer.trim(),
token_count: bufferTokens,
chunk_index: chunkIndex++,
});
buffer = para;
bufferTokens = paraTokens;
} else {
buffer += (buffer ? '\n\n' : '') + para;
bufferTokens += paraTokens;
}
}
// Emit remaining buffer
if (buffer && bufferTokens >= MIN_CHUNK_TOKENS) {
chunks.push({
id: uuidv4(),
doc_id: docId,
doc_title: docTitle,
doc_url: docUrl,
section_path: currentPath,
heading: section.heading || currentPath[currentPath.length - 1] || 'Content',
content: buffer.trim(),
token_count: bufferTokens,
chunk_index: chunkIndex++,
});
} else if (buffer && chunks.length > 0) {
// Merge small remainder with previous chunk
const lastChunk = chunks[chunks.length - 1];
lastChunk.content += '\n\n' + buffer.trim();
lastChunk.token_count += bufferTokens;
}
} else if (tokens >= MIN_CHUNK_TOKENS) {
// Content is appropriate size
chunks.push({
id: uuidv4(),
doc_id: docId,
doc_title: docTitle,
doc_url: docUrl,
section_path: currentPath,
heading: section.heading || currentPath[currentPath.length - 1] || 'Content',
content,
token_count: tokens,
chunk_index: chunkIndex++,
});
} else if (chunks.length > 0) {
// Too small - merge with previous chunk
const lastChunk = chunks[chunks.length - 1];
lastChunk.content += '\n\n' + content;
lastChunk.token_count += tokens;
} else {
// First chunk is small - still create it
chunks.push({
id: uuidv4(),
doc_id: docId,
doc_title: docTitle,
doc_url: docUrl,
section_path: currentPath,
heading: section.heading || 'Introduction',
content,
token_count: tokens,
chunk_index: chunkIndex++,
});
}
}
// Process children
for (const child of section.children) {
processSection(child, currentPath);
}
}
for (const section of sections) {
processSection(section, parentPath);
}
return chunks;
}
/**
* Generate a summary from the first chunk or intro section
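 * Falls back to the first 300 characters, cut back to a sentence boundary when
 * one occurs after character 100.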
*/
function generateSummary(chunks: DocChunk[]): string {
if (chunks.length === 0) return '';
// Find intro or first chunk
const introChunk = chunks.find((c) => c.heading.toLowerCase().includes('intro')) || chunks[0];
    const summary = introChunk.content.substring(0, 300);
    // Nothing was cut off - return as-is
    if (summary.length === introChunk.content.length) return summary;
    // Otherwise truncate at a sentence boundary if one exists reasonably far in
    const lastPeriod = summary.lastIndexOf('.');
    if (lastPeriod > 100) {
        return summary.substring(0, lastPeriod + 1);
    }
    return summary + '...';
}
/**
* Extract top-level sections from chunks
*/
function extractSections(chunks: DocChunk[]): string[] {
const sections = new Set<string>();
for (const chunk of chunks) {
if (chunk.section_path.length > 0) {
sections.add(chunk.section_path[0]);
}
}
return Array.from(sections);
}
/**
* Scrape documentation and store with chunking and embeddings
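 *
 * Illustrative usage (the options shown are the ones this function reads):
 * ```ts
 * const { docsProcessed, chunksCreated } = await scrapeDocs({
 *     startUrls: ['https://docs.apify.com/sdk/js/docs'],
 *     maxPages: 50,
 * });
 * ```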
*/
export async function scrapeDocs(options: ScraperOptions = {}): Promise<{ docsProcessed: number; chunksCreated: number }> {
const {
startUrls = [
'https://docs.apify.com/sdk/js/docs',
'https://docs.apify.com/sdk/js/reference',
],
maxPages = 100,
} = options;
// Auto-generate URL patterns from start URLs
const urlPatterns = startUrls.map((url) => {
try {
const urlObj = new URL(url);
const base = urlObj.origin + urlObj.pathname.replace(/\/$/, '');
return `${base}/**`;
} catch {
return url;
}
});
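    // e.g. 'https://docs.apify.com/sdk/js/docs' -> 'https://docs.apify.com/sdk/js/docs/**'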
    log.info('Auto-generated URL patterns', { urlPatterns });
const scrapedPages: ScrapedPage[] = [];
const crawler = new CheerioCrawler({
maxRequestsPerCrawl: maxPages,
async requestHandler({ request, $, enqueueLinks }) {
log.info(`Scraping: ${request.url}`);
// Extract title
const title = $('h1').first().text().trim()
|| $('title').first().text().trim()
|| $('h2').first().text().trim()
|| 'Untitled';
// Get full HTML for parsing
const html = $.html();
// Determine page type
let type: 'api' | 'guide' | 'example' = 'guide';
if (request.url.includes('/api/') || request.url.includes('/reference/')) type = 'api';
if (request.url.includes('/examples/') || request.url.includes('/example')) type = 'example';
scrapedPages.push({
url: request.url,
title,
html,
type,
});
// Enqueue more links
await enqueueLinks({
globs: urlPatterns,
exclude: ['**/*.pdf', '**/*.zip', '**/*.png', '**/*.jpg'],
});
},
});
// Run crawler
await crawler.run(startUrls);
log.info(`Scraped ${scrapedPages.length} pages`);
// Process pages: chunk, embed, store
const vectorDb = getVectorDB();
let totalChunks = 0;
for (let i = 0; i < scrapedPages.length; i++) {
const page = scrapedPages[i];
log.info(`Processing page ${i + 1}/${scrapedPages.length}: ${page.title}`);
// Parse HTML to sections
const $ = cheerio.load(page.html);
const sections = parseHtmlToSections($);
// Generate doc ID
const docId = `doc-${i + 1}`;
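        // Note: doc IDs are positional, so they are only stable within a single
        // run; a re-scrape may assign the same URL a different ID.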
// Convert sections to chunks
const chunks = sectionsToChunks(sections, docId, page.title, page.url);
if (chunks.length === 0) {
log.warning(`No chunks generated for: ${page.url}`);
continue;
}
log.info(`Generated ${chunks.length} chunks for: ${page.title}`);
// Generate embeddings for all chunks
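        // The section heading is prepended so each embedding captures its context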
const texts = chunks.map((c) => `${c.heading}\n\n${c.content}`);
const embeddings = await generateEmbeddings(texts);
// Attach embeddings to chunks
for (let j = 0; j < chunks.length; j++) {
chunks[j].embedding = embeddings[j];
}
// Create metadata
const metadata: DocMetadata = {
id: docId,
title: page.title,
url: page.url,
summary: generateSummary(chunks),
sections: extractSections(chunks),
total_chunks: chunks.length,
type: page.type,
};
// Store in database
await vectorDb.upsertDocMetadata(metadata);
await vectorDb.upsertChunks(chunks);
totalChunks += chunks.length;
log.info(`Stored ${chunks.length} chunks for doc: ${docId}`);
}
log.info(`Total: ${scrapedPages.length} docs, ${totalChunks} chunks`);
return { docsProcessed: scrapedPages.length, chunksCreated: totalChunks };
}
// Re-export types for backwards compatibility
export type { DocChunk, DocMetadata };