TextChunker.ts (11.3 kB)
import path from 'path';
import { DocumentChunk } from '../types/index.js';
import { log } from './Logger.js';

export interface ChunkingConfig {
  chunkSize?: number;
  overlap?: number;
  method?: 'fixed' | 'sentence-aware' | 'paragraph-aware';
  preserveCodeBlocks?: boolean;
  preserveMarkdownHeaders?: boolean;
}

export class TextChunker {
  private defaultConfig: Required<ChunkingConfig>;

  constructor(config: ChunkingConfig = {}) {
    log.debug('Initializing TextChunker', { config });

    this.defaultConfig = {
      chunkSize: 1000,
      overlap: 200,
      method: 'fixed',
      preserveCodeBlocks: true,
      preserveMarkdownHeaders: true,
      ...config
    };

    log.debug('TextChunker initialized successfully', this.defaultConfig);
  }

  /**
   * Splits text into document chunks with optional event loop yielding for large files
   */
  async chunkText(text: string, filePath: string, config?: Partial<ChunkingConfig>): Promise<DocumentChunk[]> {
    const timer = log.time(`chunk-text-${path.basename(filePath)}`);
    const mergedConfig = { ...this.defaultConfig, ...config };

    try {
      const processedText = this.preprocessText(text, mergedConfig);
      const segments = await this.splitIntoSegmentsAsync(processedText, mergedConfig);
      const chunks = await this.createChunksWithOverlapAsync(segments, filePath, mergedConfig);

      timer();
      log.info('Text chunking completed', {
        filePath,
        originalTextLength: text.length,
        chunksCreated: chunks.length,
        totalTokens: chunks.reduce((sum, c) => sum + c.metadata.tokenCount, 0)
      });

      return chunks;
    } catch (error: any) {
      log.error('Text chunking failed', error, { filePath, textLength: text.length });
      throw error;
    }
  }

  /**
   * Pre-process text before chunking
   */
  private preprocessText(text: string, config: Required<ChunkingConfig>): string {
    let processed = text;

    // Handle special content based on configuration
    if (config.preserveCodeBlocks && config.method === 'sentence-aware') {
      processed = this.preserveBlockContent(processed, '```', '```');
    }

    if (config.preserveMarkdownHeaders) {
      processed = this.preserveHeaderStructure(processed);
    }

    return processed;
  }

  /**
   * Preserve block content by replacing it with placeholders.
   * Note: the collected blocks are not re-inserted downstream; placeholders
   * remain in the chunked text.
   */
  private preserveBlockContent(text: string, startMarker: string, endMarker: string): string {
    const preservedBlocks: string[] = [];
    let placeholderIndex = 0;

    return text.replace(
      new RegExp(`${this.escapeRegExp(startMarker)}([\\s\\S]*?)${this.escapeRegExp(endMarker)}`, 'g'),
      (match) => {
        preservedBlocks.push(match);
        const placeholder = `__PRESERVED_BLOCK_${placeholderIndex}__`;
        placeholderIndex++;
        return placeholder;
      }
    );
  }

  /**
   * Preserve markdown header structure
   */
  private preserveHeaderStructure(text: string): string {
    // Simple header preservation - in production could be more sophisticated
    return text;
  }

  /**
   * Split text into segments based on method with event loop yielding
   */
  private async splitIntoSegmentsAsync(text: string, config: Required<ChunkingConfig>): Promise<string[]> {
    switch (config.method) {
      case 'sentence-aware':
        return await this.splitBySentencesAsync(text);
      case 'paragraph-aware':
        return await this.splitByParagraphsAsync(text);
      default:
        return await this.splitFixedSizeAsync(text, config.chunkSize);
    }
  }

  /**
   * Split text into segments based on method (sync version for backward compatibility)
   */
  private splitIntoSegments(text: string, config: Required<ChunkingConfig>): string[] {
    switch (config.method) {
      case 'sentence-aware':
        return this.splitBySentences(text);
      case 'paragraph-aware':
        return this.splitByParagraphs(text);
      default:
        return this.splitFixedSize(text, config.chunkSize);
    }
  }

  /**
   * Split text by sentences
   */
  private splitBySentences(text: string): string[] {
    // Basic sentence splitting - in production use a proper NLP library
    const sentences = text.split(/[.!?]+\s+/);
    return sentences.filter(s => s.trim().length > 0);
  }

  /**
   * Split text by paragraphs
   */
  private splitByParagraphs(text: string): string[] {
    const paragraphs = text.split(/\n\s*\n/);
    return paragraphs.filter(p => p.trim().length > 0);
  }

  /**
   * Split text into fixed-size chunks with event loop yielding for large files
   */
  private async splitFixedSizeAsync(text: string, size: number): Promise<string[]> {
    const chunks: string[] = [];
    let currentIndex = 0;
    let processedChunks = 0;

    while (currentIndex < text.length) {
      const endIndex = Math.min(currentIndex + size, text.length);
      const chunk = text.slice(currentIndex, endIndex);

      // Try to find a good breaking point (sentence or word boundary)
      const adjustedChunk = this.findGoodBreakpoint(chunk, text, currentIndex, size);
      chunks.push(adjustedChunk);

      currentIndex += adjustedChunk.length;
      processedChunks++;

      // Yield control every 1000 chunks to prevent blocking
      if (processedChunks % 1000 === 0) {
        log.debug(`Processed ${processedChunks} text segments, yielding control`);
        await new Promise(resolve => setImmediate(resolve));
      }
    }

    return chunks;
  }

  /**
   * Split text by sentences with event loop yielding
   */
  private async splitBySentencesAsync(text: string): Promise<string[]> {
    // For sentence splitting, yield once for very large texts so the event loop can run
    const sentences = text.split(/[.!?]+\s+/);

    if (sentences.length > 10000) {
      log.debug(`Processing ${sentences.length} sentences with yielding`);
      await new Promise(resolve => setImmediate(resolve));
    }

    return sentences.filter(s => s.trim().length > 0);
  }

  /**
   * Split text by paragraphs with event loop yielding
   */
  private async splitByParagraphsAsync(text: string): Promise<string[]> {
    // For paragraph splitting, yield once for very large texts so the event loop can run
    const paragraphs = text.split(/\n\s*\n/);

    if (paragraphs.length > 5000) {
      log.debug(`Processing ${paragraphs.length} paragraphs with yielding`);
      await new Promise(resolve => setImmediate(resolve));
    }

    return paragraphs.filter(p => p.trim().length > 0);
  }

  /**
   * Split text into fixed-size chunks (sync version)
   */
  private splitFixedSize(text: string, size: number): string[] {
    const chunks: string[] = [];
    let currentIndex = 0;

    while (currentIndex < text.length) {
      const endIndex = Math.min(currentIndex + size, text.length);
      const chunk = text.slice(currentIndex, endIndex);

      // Try to find a good breaking point (sentence or word boundary)
      const adjustedChunk = this.findGoodBreakpoint(chunk, text, currentIndex, size);
      chunks.push(adjustedChunk);

      currentIndex += adjustedChunk.length;
    }

    return chunks;
  }

  /**
   * Find a good breaking point in chunk
   */
  private findGoodBreakpoint(chunk: string, original: string, startIndex: number, targetSize: number): string {
    if (chunk.length <= targetSize * 0.8) {
      return chunk; // Small chunk, keep as is
    }

    // Look for sentence endings first
    const lastSentenceEnd = Math.max(
      chunk.lastIndexOf('.'),
      chunk.lastIndexOf('!'),
      chunk.lastIndexOf('?')
    );

    if (lastSentenceEnd > targetSize * 0.5) {
      return chunk.substring(0, lastSentenceEnd + 1);
    }

    // Look for word boundaries
    const lastSpace = chunk.lastIndexOf(' ');
    if (lastSpace > targetSize * 0.3) {
      return chunk.substring(0, lastSpace);
    }

    // Fall back to character boundary at target size
    return chunk.substring(0, Math.min(targetSize, chunk.length));
  }

  /**
   * Create chunks with overlap and event loop yielding for large files
   */
  private async createChunksWithOverlapAsync(segments: string[], filePath: string, config: Required<ChunkingConfig>): Promise<DocumentChunk[]> {
    const chunks: DocumentChunk[] = [];
    const processedContent = segments.join(' ');
    let currentIndex = 0;
    let chunkIndex = 0;

    while (currentIndex < processedContent.length) {
      const endIndex = Math.min(currentIndex + config.chunkSize, processedContent.length);
      const content = processedContent.slice(currentIndex, endIndex);

      // Create chunk with guaranteed integer data types
      const chunk: DocumentChunk = {
        id: `${filePath}:${chunkIndex}`,
        filePath,
        chunkIndex: Math.floor(chunkIndex), // Ensure integer
        content,
        embedding: [], // Will be filled by embedding service
        metadata: {
          fileSize: Buffer.byteLength(processedContent, 'utf-8'),
          lastModified: new Date(),
          chunkOffset: Math.floor(currentIndex), // Ensure integer
          tokenCount: Math.ceil(content.length / 4) // Already integer from Math.ceil
        }
      };

      chunks.push(chunk);

      // Calculate next starting position with overlap (ensure integer)
      currentIndex = Math.floor(Math.max(currentIndex + config.chunkSize - config.overlap, currentIndex + 1));
      chunkIndex = Math.floor(chunkIndex + 1); // Ensure integer increment

      // Yield control every 1000 chunks to prevent blocking main thread
      if (chunkIndex % 1000 === 0) {
        log.debug(`Created ${chunkIndex} chunks, yielding control to event loop`);
        await new Promise(resolve => setImmediate(resolve));
      }
    }

    return chunks;
  }

  /**
   * Create chunks with overlap (sync version for backward compatibility)
   */
  private createChunksWithOverlap(segments: string[], filePath: string, config: Required<ChunkingConfig>): DocumentChunk[] {
    const chunks: DocumentChunk[] = [];
    const processedContent = segments.join(' ');
    let currentIndex = 0;
    let chunkIndex = 0;

    while (currentIndex < processedContent.length) {
      const endIndex = Math.min(currentIndex + config.chunkSize, processedContent.length);
      const content = processedContent.slice(currentIndex, endIndex);

      // Create chunk with guaranteed integer data types
      const chunk: DocumentChunk = {
        id: `${filePath}:${chunkIndex}`,
        filePath,
        chunkIndex: Math.floor(chunkIndex), // Ensure integer
        content,
        embedding: [], // Will be filled by embedding service
        metadata: {
          fileSize: Buffer.byteLength(processedContent, 'utf-8'),
          lastModified: new Date(),
          chunkOffset: Math.floor(currentIndex), // Ensure integer
          tokenCount: Math.ceil(content.length / 4) // Already integer from Math.ceil
        }
      };

      chunks.push(chunk);

      // Calculate next starting position with overlap (ensure integer)
      currentIndex = Math.floor(Math.max(currentIndex + config.chunkSize - config.overlap, currentIndex + 1));
      chunkIndex = Math.floor(chunkIndex + 1); // Ensure integer increment
    }

    return chunks;
  }

  /**
   * Escape special regex characters
   */
  private escapeRegExp(string: string): string {
    return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }
}
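
A minimal usage sketch, not part of the file above: it assumes the class is importable as './TextChunker.js' and that the surrounding project runs as an ES module, so top-level await is available. The input text and file path are illustrative.

// Sentence-aware chunking with a smaller window and 100-character overlap.
import { TextChunker } from './TextChunker.js';

const chunker = new TextChunker({ chunkSize: 500, overlap: 100, method: 'sentence-aware' });

const chunks = await chunker.chunkText(
  'First sentence. Second sentence. Third sentence.',
  '/docs/example.md' // illustrative path; used only for chunk ids and logging
);

// Each chunk id is `${filePath}:${chunkIndex}`; successive chunkOffset values
// step by chunkSize - overlap (400 here). embedding stays [] until an
// embedding service fills it in.
for (const chunk of chunks) {
  console.log(chunk.id, chunk.metadata.chunkOffset, chunk.metadata.tokenCount);
}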
