import { CrawlResult, DocumentChunk, DocumentProcessor, ProcessedDocument } from '../types.js';
import { EmbeddingsProvider } from '../embeddings/types.js';
import { processHtmlContent } from './content.js';
import { processMarkdownContent, processExtractedContent } from './markdown.js';
import { isMarkdownPath } from '../config.js';
import { logger } from '../util/logger.js';
import { parseMetadata } from './metadata-parser.js';
// Extractors that return already-formatted or plain text content (not raw HTML)
const FORMATTED_CONTENT_EXTRACTORS = [
'StorybookExtractor',
'GithubPagesExtractor',
'DefaultExtractor', // Crawlee's default extractor returns plain text, not HTML
  // Add more extractors here as they are implemented; names must exactly match
  // the `extractorUsed` value reported by the crawler
];
/**
* Create a DocumentChunk with parsed metadata from the content
*/
function createChunkWithMetadata(
content: string,
baseMetadata: { url: string; title: string; path: string },
startLine: number,
endLine: number,
vector: number[]
): DocumentChunk {
const parsed = parseMetadata(content);
return {
content,
startLine,
endLine,
vector,
url: baseMetadata.url,
title: baseMetadata.title,
path: baseMetadata.path,
metadata: {
type: parsed.contentType,
props: parsed.props.length > 0 ? parsed.props : undefined,
codeBlocks: parsed.codeBlocks.length > 0 ? parsed.codeBlocks : undefined,
},
};
}
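/**
 * Split content into embedding-ready chunks along semantic boundaries.
 *
 * Sections are delimited by blank lines and accumulated into a chunk while the
 * estimated token count (~4 characters per token) stays under `maxChunkSize`;
 * sections that are too large on their own are split further by sentence. Each
 * chunk is embedded via the provided EmbeddingsProvider before being yielded
 * as a DocumentChunk.
 */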
async function* semanticChunker(
content: string,
maxChunkSize: number,
embeddings: EmbeddingsProvider,
metadata: {
url: string;
title: string;
path: string;
}
): AsyncGenerator<DocumentChunk> {
if (content.trim().length === 0) {
return;
}
  // Split content into semantic sections on blank lines (paragraphs, lists, code blocks)
const sections = content.split(/(?:\r?\n){2,}/);
let currentChunk = '';
let startLine = 0;
let currentLine = 0;
let tokenCount = 0;
for (const section of sections) {
const sectionLines = section.split('\n');
const sectionText = section.trim();
if (sectionText.length === 0) {
currentLine += sectionLines.length;
continue;
}
// Estimate token count (rough approximation: 4 chars per token)
const sectionTokens = Math.ceil(sectionText.length / 4);
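    // e.g. a 400-character section is estimated at 100 tokens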
// If section alone is too large, split it further
if (sectionTokens > maxChunkSize) {
      // First flush the current chunk if not empty, resetting the token counter
      // so the next accumulated chunk is not measured against a stale count
      if (currentChunk.trim().length > 0) {
        const vector = await embeddings.embed(currentChunk);
        yield createChunkWithMetadata(currentChunk.trim(), metadata, startLine, currentLine - 1, vector);
        currentChunk = '';
        tokenCount = 0;
      }
      // Split the oversized section by sentences, keeping any trailing text that
      // lacks terminal punctuation (line numbers for these chunks are approximate)
      const sentences = sectionText.match(/[^.!?]+[.!?]+|[^.!?]+$/g) || [sectionText];
let sentenceChunk = '';
let sentenceTokens = 0;
for (const sentence of sentences) {
const nextTokens = Math.ceil(sentence.length / 4);
if (sentenceTokens + nextTokens > maxChunkSize - 5) {
if (sentenceChunk.trim().length > 0) {
const vector = await embeddings.embed(sentenceChunk);
yield createChunkWithMetadata(
sentenceChunk.trim(),
metadata,
currentLine,
currentLine + sentenceChunk.split('\n').length - 1,
vector
);
}
sentenceChunk = sentence;
sentenceTokens = nextTokens;
} else {
sentenceChunk += ' ' + sentence;
sentenceTokens += nextTokens;
}
}
// Yield remaining sentence chunk
if (sentenceChunk.trim().length > 0) {
const vector = await embeddings.embed(sentenceChunk);
yield createChunkWithMetadata(
sentenceChunk.trim(),
metadata,
currentLine,
currentLine + sentenceChunk.split('\n').length - 1,
vector
);
}
}
    // If adding this section would exceed the limit (minus a small safety margin),
    // yield the current chunk and start a new one
else if (tokenCount + sectionTokens > maxChunkSize - 5) {
if (currentChunk.trim().length > 0) {
const vector = await embeddings.embed(currentChunk);
yield createChunkWithMetadata(currentChunk.trim(), metadata, startLine, currentLine - 1, vector);
}
currentChunk = sectionText;
tokenCount = sectionTokens;
startLine = currentLine;
}
    // Otherwise add section to current chunk
    else {
      if (currentChunk.length > 0) {
        currentChunk += '\n\n';
      } else {
        // Starting a fresh chunk (e.g. right after an oversized section was
        // flushed), so record where it begins
        startLine = currentLine;
      }
      currentChunk += sectionText;
      tokenCount += sectionTokens;
    }
currentLine += sectionLines.length;
}
// Yield final chunk if not empty
if (currentChunk.trim().length > 0) {
const vector = await embeddings.embed(currentChunk);
yield createChunkWithMetadata(currentChunk.trim(), metadata, startLine, currentLine - 1, vector);
}
}
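/**
 * Processes crawled web pages into embedded document chunks, picking the
 * extracted-content, markdown, or HTML processing path based on which
 * extractor produced the content and the file path.
 *
 * Minimal usage sketch; `embeddings` and `crawlResult` are hypothetical
 * placeholders for an EmbeddingsProvider and a CrawlResult supplied by the
 * caller, not values defined in this module:
 *
 * @example
 * const processor = new WebDocumentProcessor(embeddings, 1000);
 * const doc = await processor.process(crawlResult);
 * logger.debug(`Indexed ${doc.chunks.length} chunks for ${doc.metadata.url}`);
 */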
export class WebDocumentProcessor implements DocumentProcessor {
constructor(
private readonly embeddings: EmbeddingsProvider,
private readonly maxChunkSize: number = 1000
) {}
async process(crawlResult: CrawlResult): Promise<ProcessedDocument> {
logger.debug(`[WebDocumentProcessor] Processing ${crawlResult.url}`);
    logger.debug(`[WebDocumentProcessor] Content length: ${crawlResult.content.length} chars`);
logger.debug(`[WebDocumentProcessor] Extractor used: ${crawlResult.extractorUsed || 'unknown'}`);
try {
// Determine content type and process accordingly
let processedContent;
      // Check whether the content came from an extractor that already returns
      // formatted or plain-text output (i.e. no HTML parsing is needed)
      const isFormattedContent =
        crawlResult.extractorUsed && FORMATTED_CONTENT_EXTRACTORS.includes(crawlResult.extractorUsed);
if (isFormattedContent) {
        // Content is already formatted (markdown or plain text) by the extractor
logger.debug(`[WebDocumentProcessor] Using extracted content processor for ${crawlResult.extractorUsed}`);
processedContent = await processExtractedContent(crawlResult);
} else if (isMarkdownPath(crawlResult.path)) {
// Raw markdown file
logger.debug(`[WebDocumentProcessor] Using markdown processor for ${crawlResult.path}`);
processedContent = await processMarkdownContent(crawlResult);
} else {
// Raw HTML - needs parsing
logger.debug(`[WebDocumentProcessor] Using HTML processor for ${crawlResult.path}`);
processedContent = await processHtmlContent(crawlResult);
}
if (!processedContent) {
logger.error(`[WebDocumentProcessor] Failed to parse document content for ${crawlResult.url}`);
throw new Error('Failed to parse document content');
}
logger.debug(`[WebDocumentProcessor] Successfully processed content for ${crawlResult.url}`);
logger.debug(`[WebDocumentProcessor] Found ${processedContent.article.components.length} components`);
logger.debug(`[WebDocumentProcessor] Creating chunks for ${processedContent.article.title}`);
const chunks: DocumentChunk[] = [];
let totalChunks = 0;
const metadata = {
url: processedContent.article.url,
title: processedContent.article.title,
path: processedContent.article.path,
};
// Process each component separately
for (const component of processedContent.article.components) {
logger.debug(`[WebDocumentProcessor] Processing component: ${component.title}`);
        logger.debug(`[WebDocumentProcessor] Component body length: ${component.body.length} chars`);
const componentContent = `${component.title}\n\n${component.body}`;
for await (const chunk of semanticChunker(componentContent, this.maxChunkSize, this.embeddings, metadata)) {
chunks.push(chunk);
totalChunks++;
}
}
logger.debug(`[WebDocumentProcessor] Created ${totalChunks} chunks`);
if (chunks.length === 0) {
logger.warn(`[WebDocumentProcessor] No valid chunks were created for ${crawlResult.url}`);
logger.warn(`[WebDocumentProcessor] Original content length: ${crawlResult.content.length}`);
logger.warn(`[WebDocumentProcessor] Processed content length: ${processedContent.content.length}`);
throw new Error('No valid chunks were created');
}
logger.debug(`[WebDocumentProcessor] Successfully processed ${crawlResult.url}`);
return {
metadata: {
url: crawlResult.url,
title: processedContent.article.title,
lastIndexed: new Date(),
},
chunks,
};
} catch (error) {
logger.error(`[WebDocumentProcessor] Error processing ${crawlResult.url}:`, error);
logger.error(`[WebDocumentProcessor] Error details:`, error instanceof Error ? error.stack : error);
throw error;
}
}
}