import path from 'path';
import { DocumentChunk } from '../types/index.js';
import { log } from './Logger.js';
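/**
* Options controlling how text is split into chunks.
*
* @example
* // A minimal configuration sketch; values mirror the constructor
* // defaults except where noted:
* const config: ChunkingConfig = {
*   chunkSize: 1000,          // target characters per chunk
*   overlap: 200,             // characters shared between neighbours
*   method: 'sentence-aware'  // default is 'fixed'
* };
*/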
export interface ChunkingConfig {
chunkSize?: number;
overlap?: number;
method?: 'fixed' | 'sentence-aware' | 'paragraph-aware';
preserveCodeBlocks?: boolean;
preserveMarkdownHeaders?: boolean;
}
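/**
* Splits text into overlapping DocumentChunk records ready for embedding.
* Supports fixed-size, sentence-aware, and paragraph-aware strategies,
* with event-loop yielding for large inputs.
*/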
export class TextChunker {
private defaultConfig: Required<ChunkingConfig>;
// Blocks swapped out during preprocessing, restored after segmentation
private preservedBlocks: string[] = [];
constructor(config: ChunkingConfig = {}) {
log.debug('Initializing TextChunker', { config });
this.defaultConfig = {
chunkSize: 1000,
overlap: 200,
method: 'fixed',
preserveCodeBlocks: true,
preserveMarkdownHeaders: true,
...config
};
log.debug('TextChunker initialized successfully', this.defaultConfig);
}
/**
* Splits text into document chunks with optional event loop yielding for large files
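*
* @example
* // Minimal usage sketch; the file path and override are illustrative:
* const chunker = new TextChunker();
* const chunks = await chunker.chunkText(text, '/docs/guide.md', { method: 'sentence-aware' });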
*/
async chunkText(text: string, filePath: string, config?: Partial<ChunkingConfig>): Promise<DocumentChunk[]> {
const timer = log.time(`chunk-text-${path.basename(filePath)}`);
const mergedConfig = { ...this.defaultConfig, ...config };
// Guard against a degenerate window: an overlap at or above the chunk
// size would advance the cursor one character at a time
if (mergedConfig.overlap >= mergedConfig.chunkSize) {
throw new Error(`overlap (${mergedConfig.overlap}) must be smaller than chunkSize (${mergedConfig.chunkSize})`);
}
try {
const processedText = this.preprocessText(text, mergedConfig);
const segments = await this.splitIntoSegmentsAsync(processedText, mergedConfig);
// Put any preserved code blocks back before assembling chunks
const restoredSegments = segments.map(s => this.restorePreservedBlocks(s));
const chunks = await this.createChunksWithOverlapAsync(restoredSegments, filePath, mergedConfig);
timer();
log.info('Text chunking completed', {
filePath,
originalTextLength: text.length,
chunksCreated: chunks.length,
totalTokens: chunks.reduce((sum, c) => sum + c.metadata.tokenCount, 0)
});
return chunks;
} catch (error: any) {
timer(); // End the timer on failure too, so it is not leaked
log.error('Text chunking failed', error, { filePath, textLength: text.length });
throw error;
}
}
/**
* Pre-process text before chunking
*/
private preprocessText(text: string, config: Required<ChunkingConfig>): string {
let processed = text;
this.preservedBlocks = [];
// Swap fenced code blocks for placeholders so sentence splitting
// cannot cut them apart; they are restored after segmentation
if (config.preserveCodeBlocks && config.method === 'sentence-aware') {
processed = this.preserveBlockContent(processed, '```', '```');
}
if (config.preserveMarkdownHeaders) {
processed = this.preserveHeaderStructure(processed);
}
return processed;
}
/**
* Preserve block content by swapping it for placeholders. The originals
* are stored on the instance so they can be restored after segmentation;
* otherwise chunks would permanently contain placeholder strings.
*/
private preserveBlockContent(text: string, startMarker: string, endMarker: string): string {
let placeholderIndex = 0;
return text.replace(
new RegExp(`${this.escapeRegExp(startMarker)}([\\s\\S]*?)${this.escapeRegExp(endMarker)}`, 'g'),
(match) => {
this.preservedBlocks.push(match);
return `__PRESERVED_BLOCK_${placeholderIndex++}__`;
}
);
}
/**
* Restore previously preserved blocks into a segment of text
*/
private restorePreservedBlocks(text: string): string {
return text.replace(/__PRESERVED_BLOCK_(\d+)__/g, (match, index) =>
this.preservedBlocks[Number(index)] ?? match);
}
/**
* Preserve markdown header structure
*/
private preserveHeaderStructure(text: string): string {
// Currently a pass-through: headers are left in place untouched;
// a fuller implementation could tag heading boundaries explicitly
return text;
}
/**
* Split text into segments based on method with event loop yielding
*/
private async splitIntoSegmentsAsync(text: string, config: Required<ChunkingConfig>): Promise<string[]> {
switch (config.method) {
case 'sentence-aware':
return await this.splitBySentencesAsync(text);
case 'paragraph-aware':
return await this.splitByParagraphsAsync(text);
default:
return await this.splitFixedSizeAsync(text, config.chunkSize);
}
}
/**
* Split text into segments based on method (sync variant, kept for
* backward compatibility; not used by chunkText)
*/
private splitIntoSegments(text: string, config: Required<ChunkingConfig>): string[] {
switch (config.method) {
case 'sentence-aware':
return this.splitBySentences(text);
case 'paragraph-aware':
return this.splitByParagraphs(text);
default:
return this.splitFixedSize(text, config.chunkSize);
}
}
/**
* Split text by sentences
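*
* @example
* // Terminators are consumed by the split (illustrative):
* // splitBySentences('One. Two! Three?') → ['One', 'Two', 'Three?']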
*/
private splitBySentences(text: string): string[] {
// Basic sentence splitting - in production use a proper NLP library
const sentences = text.split(/[.!?]+\s+/);
return sentences.filter(s => s.trim().length > 0);
}
/**
* Split text by paragraphs
*/
private splitByParagraphs(text: string): string[] {
const paragraphs = text.split(/\n\s*\n/);
return paragraphs.filter(p => p.trim().length > 0);
}
/**
* Split text into fixed-size chunks with event loop yielding for large files
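*
* @example
* // Breakpoints prefer word boundaries over exact sizes (illustrative):
* // splitFixedSizeAsync('aaaa aaaa aaaa', 10) resolves to ['aaaa aaaa', ' aaaa']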
*/
private async splitFixedSizeAsync(text: string, size: number): Promise<string[]> {
const chunks: string[] = [];
let currentIndex = 0;
let processedChunks = 0;
while (currentIndex < text.length) {
const endIndex = Math.min(currentIndex + size, text.length);
const chunk = text.slice(currentIndex, endIndex);
// Try to find a good breaking point (sentence or word boundary)
const adjustedChunk = this.findGoodBreakpoint(chunk, size);
chunks.push(adjustedChunk);
currentIndex += adjustedChunk.length;
processedChunks++;
// Yield control every 1000 chunks to prevent blocking
if (processedChunks % 1000 === 0) {
log.debug(`Processed ${processedChunks} text segments, yielding control`);
await new Promise(resolve => setImmediate(resolve));
}
}
return chunks;
}
/**
* Split text by sentences with event loop yielding
*/
private async splitBySentencesAsync(text: string): Promise<string[]> {
// The split itself is synchronous; yield once afterwards so very
// large results do not monopolize the event loop
const sentences = text.split(/[.!?]+\s+/);
if (sentences.length > 10000) {
log.debug(`Processing ${sentences.length} sentences with yielding`);
await new Promise(resolve => setImmediate(resolve));
}
return sentences.filter(s => s.trim().length > 0);
}
/**
* Split text by paragraphs with event loop yielding
*/
private async splitByParagraphsAsync(text: string): Promise<string[]> {
// The split itself is synchronous; yield once afterwards so very
// large results do not monopolize the event loop
const paragraphs = text.split(/\n\s*\n/);
if (paragraphs.length > 5000) {
log.debug(`Processing ${paragraphs.length} paragraphs with yielding`);
await new Promise(resolve => setImmediate(resolve));
}
return paragraphs.filter(p => p.trim().length > 0);
}
/**
* Split text into fixed-size chunks (sync version)
*/
private splitFixedSize(text: string, size: number): string[] {
const chunks: string[] = [];
let currentIndex = 0;
while (currentIndex < text.length) {
const endIndex = Math.min(currentIndex + size, text.length);
const chunk = text.slice(currentIndex, endIndex);
// Try to find a good breaking point (sentence or word boundary)
const adjustedChunk = this.findGoodBreakpoint(chunk, size);
chunks.push(adjustedChunk);
currentIndex += adjustedChunk.length;
}
return chunks;
}
/**
* Find a good breaking point in a chunk.
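* Preference order: sentence ending past the midpoint, then the last
* word boundary, then a hard cut at the target size.
*
* @example
* // The sentence ending wins over the later word boundary (illustrative):
* // findGoodBreakpoint('Hi there. Bye', 10) → 'Hi there.'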
*/
private findGoodBreakpoint(chunk: string, targetSize: number): string {
if (chunk.length <= targetSize * 0.8) {
return chunk; // Small chunk, keep as is
}
// Prefer the last sentence ending if it falls past the halfway point
const lastSentenceEnd = Math.max(
chunk.lastIndexOf('.'),
chunk.lastIndexOf('!'),
chunk.lastIndexOf('?')
);
if (lastSentenceEnd > targetSize * 0.5) {
return chunk.substring(0, lastSentenceEnd + 1);
}
// Otherwise break at the last word boundary, unless it is too early
const lastSpace = chunk.lastIndexOf(' ');
if (lastSpace > targetSize * 0.3) {
return chunk.substring(0, lastSpace);
}
// Fall back to a hard cut at the target size
return chunk.substring(0, Math.min(targetSize, chunk.length));
}
/**
* Create chunks with overlap and event loop yielding for large files
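* Segments are re-joined and re-split at a fixed size, so the overlap
* window runs over the full processed text.
*
* @example
* // Window arithmetic (illustrative): chunkSize 1000 and overlap 200
* // produce chunks starting at offsets 0, 800, 1600, ...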
*/
private async createChunksWithOverlapAsync(segments: string[], filePath: string, config: Required<ChunkingConfig>): Promise<DocumentChunk[]> {
const chunks: DocumentChunk[] = [];
const processedContent = segments.join(' ');
let currentIndex = 0;
let chunkIndex = 0;
while (currentIndex < processedContent.length) {
const endIndex = Math.min(currentIndex + config.chunkSize, processedContent.length);
const content = processedContent.slice(currentIndex, endIndex);
const chunk: DocumentChunk = {
id: `${filePath}:${chunkIndex}`,
filePath,
chunkIndex,
content,
embedding: [], // Will be filled by embedding service
metadata: {
// Byte length of the processed text, not the on-disk file size
fileSize: Buffer.byteLength(processedContent, 'utf-8'),
lastModified: new Date(),
chunkOffset: currentIndex,
tokenCount: Math.ceil(content.length / 4) // Rough heuristic: ~4 chars per token
}
};
chunks.push(chunk);
// Advance by chunkSize minus overlap; Math.floor guards against
// fractional config values and Math.max guarantees forward progress
currentIndex = Math.floor(Math.max(currentIndex + config.chunkSize - config.overlap, currentIndex + 1));
chunkIndex++;
// Yield control every 1000 chunks to avoid blocking the main thread
if (chunkIndex % 1000 === 0) {
log.debug(`Created ${chunkIndex} chunks, yielding control to event loop`);
await new Promise(resolve => setImmediate(resolve));
}
}
return chunks;
}
/**
* Create chunks with overlap (sync variant, kept for backward
* compatibility; not used by chunkText)
*/
private createChunksWithOverlap(segments: string[], filePath: string, config: Required<ChunkingConfig>): DocumentChunk[] {
const chunks: DocumentChunk[] = [];
const processedContent = segments.join(' ');
let currentIndex = 0;
let chunkIndex = 0;
while (currentIndex < processedContent.length) {
const endIndex = Math.min(currentIndex + config.chunkSize, processedContent.length);
const content = processedContent.slice(currentIndex, endIndex);
const chunk: DocumentChunk = {
id: `${filePath}:${chunkIndex}`,
filePath,
chunkIndex,
content,
embedding: [], // Will be filled by embedding service
metadata: {
// Byte length of the processed text, not the on-disk file size
fileSize: Buffer.byteLength(processedContent, 'utf-8'),
lastModified: new Date(),
chunkOffset: currentIndex,
tokenCount: Math.ceil(content.length / 4) // Rough heuristic: ~4 chars per token
}
};
chunks.push(chunk);
// Advance by chunkSize minus overlap; Math.floor guards against
// fractional config values and Math.max guarantees forward progress
currentIndex = Math.floor(Math.max(currentIndex + config.chunkSize - config.overlap, currentIndex + 1));
chunkIndex++;
}
return chunks;
}
/**
* Escape special regex characters
*/
private escapeRegExp(string: string): string {
return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
}