/**
* PDF Document Format-Aware Chunking
*
* Implements chunking that respects PDF page structure using pdf2json's
* page and text block coordinate system for precise text location.
*/
import { ParsedContent, TextChunk, ChunkedContent, PDFMetadata, createDefaultSemanticMetadata } from '../../types/index.js';
/**
* Represents a text block from pdf2json
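 *
 * Note: this is assumed to be the normalized shape produced by this
 * project's PDF parser, not pdf2json's raw text-run format (which nests
 * URI-encoded text under R[].T).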
*/
interface PdfTextBlock {
x: number; // X coordinate
y: number; // Y coordinate
w: number; // Width
h: number; // Height
text: string; // Actual text content
}
/**
* Represents a parsed PDF page structure
*/
interface PdfPageStructure {
pageIndex: number;
textBlocks: PdfTextBlock[];
width: number;
height: number;
}
/**
* Service for PDF document format-aware chunking
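 *
 * Usage sketch (hypothetical caller; `parsed` is assumed to come from
 * this project's PDF parser):
 *
 *   const chunker = new PdfChunkingService();
 *   const { chunks, totalChunks } = chunker.chunkPdfDocument(parsed);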
*/
export class PdfChunkingService {
private readonly DEFAULT_MAX_TOKENS = 1000;
private readonly DEFAULT_MIN_TOKENS = 100;
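  /** Rough heuristic shared by every size check in this service */
  private readonly CHARS_PER_TOKEN = 4;
  /**
   * Estimate the token count of a piece of text. A real tokenizer would
   * be more accurate; this matches the ~4-characters-per-token
   * approximation used throughout the chunking logic.
   */
  private estimateTokens(text: string): number {
    return Math.ceil(text.length / this.CHARS_PER_TOKEN);
  }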
/**
* Chunk a PDF document respecting page boundaries and text blocks
*/
public chunkPdfDocument(
content: ParsedContent,
maxTokens: number = this.DEFAULT_MAX_TOKENS,
minTokens: number = this.DEFAULT_MIN_TOKENS
): ChunkedContent {
if (content.type !== 'pdf') {
throw new Error('Content must be a PDF document');
}
const metadata = content.metadata as PDFMetadata;
// Get page structures from metadata (stored by parser)
    const pageStructures = (metadata as PDFMetadata & { pageStructures?: PdfPageStructure[] }).pageStructures;
if (!pageStructures || pageStructures.length === 0) {
// Fallback to simple text chunking if no structure available
return this.fallbackTextChunking(content, maxTokens, minTokens);
}
// Create chunks respecting page boundaries
const chunks = this.createPageAwareChunks(
pageStructures,
content.content,
maxTokens,
minTokens
);
    // Stamp the final count into each chunk's metadata
    chunks.forEach(chunk => {
      chunk.metadata.totalChunks = chunks.length;
    });
    return {
      originalContent: content,
      chunks,
      totalChunks: chunks.length
    };
}
/**
* Create chunks respecting page boundaries and text blocks
*/
private createPageAwareChunks(
pageStructures: PdfPageStructure[],
fullText: string,
maxTokens: number,
minTokens: number
): TextChunk[] {
const chunks: TextChunk[] = [];
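    // Tracks where each page's text begins in the assembled full text so
    // chunk offsets can be searched for in the right region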
let globalTextOffset = 0;
for (const pageStructure of pageStructures) {
const pageChunks = this.chunkSinglePage(
pageStructure,
fullText,
globalTextOffset,
chunks.length,
maxTokens,
minTokens
);
chunks.push(...pageChunks);
      // Advance the global offset past this page's text (assumes the
      // full text was assembled by joining blocks with single spaces;
      // createChunk's forward search tolerates small drift)
if (pageStructure.textBlocks.length > 0) {
const pageText = pageStructure.textBlocks
.map(block => block.text)
.join(' ');
globalTextOffset += pageText.length;
}
}
return chunks;
}
/**
* Chunk a single page into text chunks
*/
private chunkSinglePage(
pageStructure: PdfPageStructure,
fullText: string,
textOffset: number,
startChunkIndex: number,
maxTokens: number,
minTokens: number
): TextChunk[] {
const chunks: TextChunk[] = [];
if (!pageStructure.textBlocks || pageStructure.textBlocks.length === 0) {
return chunks;
}
    // Start a fresh accumulator from a single text block (used for the
    // first block and again after flushing a full chunk)
    const startChunk = (block: PdfTextBlock, blockIndex: number, blockTokens: number) => ({
      textBlocks: [block],
      text: block.text,
      tokenCount: blockTokens,
      startBlockIndex: blockIndex,
      endBlockIndex: blockIndex,
      minX: block.x,
      minY: block.y,
      maxWidth: block.w,
      maxHeight: block.h
    });
    let currentChunk: ReturnType<typeof startChunk> | null = null;
for (let i = 0; i < pageStructure.textBlocks.length; i++) {
const block = pageStructure.textBlocks[i];
if (!block || !block.text) continue;
      const blockTokens = this.estimateTokens(block.text);
      if (!currentChunk) {
        // Start new chunk
        currentChunk = startChunk(block, i, blockTokens);
} else {
// Check if adding this block would exceed max tokens
const potentialTokens = currentChunk.tokenCount + blockTokens;
if (potentialTokens > maxTokens && currentChunk.tokenCount >= minTokens) {
// Save current chunk
          chunks.push(this.createChunk(
            currentChunk.text,
            startChunkIndex + chunks.length,
            fullText,
            pageStructure.pageIndex,
            currentChunk,
            textOffset
          ));
          // Start new chunk with this block
          currentChunk = startChunk(block, i, blockTokens);
} else {
// Add block to current chunk
currentChunk.textBlocks.push(block);
currentChunk.text += ' ' + block.text;
currentChunk.tokenCount += blockTokens;
currentChunk.endBlockIndex = i;
          // Update coordinate bounds (Math.min/max keeps a legitimate
          // 0 coordinate that a truthiness check would drop)
          currentChunk.minX = Math.min(currentChunk.minX, block.x);
          currentChunk.minY = Math.min(currentChunk.minY, block.y);
          currentChunk.maxWidth = Math.max(currentChunk.maxWidth, block.w);
          currentChunk.maxHeight = Math.max(currentChunk.maxHeight, block.h);
}
}
}
// Save final chunk
if (currentChunk && currentChunk.text.trim()) {
      chunks.push(this.createChunk(
        currentChunk.text,
        startChunkIndex + chunks.length,
        fullText,
        pageStructure.pageIndex,
        currentChunk,
        textOffset
      ));
}
return chunks;
}
  /**
   * Create a single chunk, locating its character offsets by searching
   * the full text from the given offset. Extraction params are no longer
   * attached; lazy loading retrieves content by chunk ID, so the page
   * index and coordinate data are accepted but currently unused.
   */
  private createChunk(
    text: string,
    index: number,
    fullText: string,
    _pageIndex: number,
    _chunkData: {
      startBlockIndex: number;
      endBlockIndex: number;
      minX?: number;
      minY?: number;
      maxWidth?: number;
      maxHeight?: number;
    },
    searchFromOffset: number = 0
  ): TextChunk {
    // Locate the chunk in the full text, searching from the page's start
    // so identical text on an earlier page is not matched by mistake
    const startOffset = fullText.indexOf(text, searchFromOffset);
    const endOffset = startOffset >= 0
      ? startOffset + text.length
      : searchFromOffset + text.length;
return {
content: text,
      // Fall back to the search offset if the text is not found verbatim
      startPosition: startOffset >= 0 ? startOffset : searchFromOffset,
      endPosition: endOffset,
      tokenCount: this.estimateTokens(text),
chunkIndex: index,
metadata: {
sourceFile: '',
sourceType: 'pdf',
totalChunks: 0,
hasOverlap: false
},
semanticMetadata: createDefaultSemanticMetadata()
};
}
/**
* Fallback to simple text chunking if no page structure available
*/
private fallbackTextChunking(
content: ParsedContent,
maxTokens: number,
minTokens: number
): ChunkedContent {
const chunks: TextChunk[] = [];
const text = content.content;
// Handle edge case: empty or very short content
if (!text || text.trim().length === 0) {
return {
originalContent: content,
chunks: [],
totalChunks: 0
};
}
// Try to split by spaces first
    const words = text.split(/\s+/).filter(w => w.length > 0);
    // If the text has no usable word boundaries, fall back to
    // character-based chunking; otherwise one massive chunk could exceed
    // the model's context window
if (words.length <= 1) {
// No word boundaries found - likely corrupted PDF extraction
// Fall back to character-based chunking
      const maxChars = maxTokens * this.CHARS_PER_TOKEN;
for (let i = 0; i < text.length; i += maxChars) {
const chunkContent = text.substring(i, Math.min(i + maxChars, text.length));
        const tokenCount = this.estimateTokens(chunkContent);
chunks.push({
content: chunkContent,
startPosition: i,
endPosition: Math.min(i + maxChars, text.length),
tokenCount: tokenCount,
chunkIndex: chunks.length,
metadata: {
sourceFile: '',
sourceType: 'pdf',
totalChunks: 0,
hasOverlap: false
},
semanticMetadata: createDefaultSemanticMetadata()
});
}
// Update total chunks
chunks.forEach(chunk => {
chunk.metadata.totalChunks = chunks.length;
});
return {
originalContent: content,
chunks,
totalChunks: chunks.length
};
}
// Normal word-based chunking when spaces are available
let currentChunk = '';
let currentTokens = 0;
let chunkStart = 0;
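    // NOTE: word-based positions are approximate when the source text
    // contains runs of whitespace, since splitting on /\s+/ collapses
    // them; the page-aware path locates chunks by searching the full
    // text instead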
for (const word of words) {
      const wordTokens = this.estimateTokens(word);
if (currentTokens + wordTokens > maxTokens && currentTokens >= minTokens) {
// Save current chunk
chunks.push({
content: currentChunk.trim(),
startPosition: chunkStart,
endPosition: chunkStart + currentChunk.length,
tokenCount: currentTokens,
chunkIndex: chunks.length,
metadata: {
sourceFile: '',
sourceType: 'pdf',
totalChunks: 0,
hasOverlap: false
},
semanticMetadata: createDefaultSemanticMetadata()
});
// Start new chunk
chunkStart += currentChunk.length + 1;
currentChunk = word;
currentTokens = wordTokens;
} else {
currentChunk += (currentChunk ? ' ' : '') + word;
currentTokens += wordTokens;
}
}
// Save final chunk - but ONLY if it's within size limits
if (currentChunk.trim()) {
      // If the final chunk exceeds the token budget, split it into
      // character-based pieces
if (currentTokens > maxTokens) {
// Final chunk is too large - need to split it
const finalText = currentChunk.trim();
        const maxChars = maxTokens * this.CHARS_PER_TOKEN;
for (let i = 0; i < finalText.length; i += maxChars) {
const chunkContent = finalText.substring(i, Math.min(i + maxChars, finalText.length));
          const tokenCount = this.estimateTokens(chunkContent);
chunks.push({
content: chunkContent,
startPosition: chunkStart + i,
endPosition: chunkStart + Math.min(i + maxChars, finalText.length),
tokenCount: tokenCount,
chunkIndex: chunks.length,
metadata: {
sourceFile: '',
sourceType: 'pdf',
totalChunks: 0,
hasOverlap: false
},
semanticMetadata: createDefaultSemanticMetadata()
});
}
} else {
// Final chunk is within limits
chunks.push({
content: currentChunk.trim(),
startPosition: chunkStart,
endPosition: text.length,
tokenCount: currentTokens,
chunkIndex: chunks.length,
metadata: {
sourceFile: '',
sourceType: 'pdf',
totalChunks: 0,
hasOverlap: false
},
semanticMetadata: createDefaultSemanticMetadata()
});
}
}
    // Stamp the final count into each chunk's metadata (the
    // character-based branch above does this before its early return)
    chunks.forEach(chunk => {
      chunk.metadata.totalChunks = chunks.length;
    });
    return {
      originalContent: content,
      chunks,
      totalChunks: chunks.length
    };
}
}