Local RAG

Overview Schema Related Servers Score Discussions

sentence-splitter.ts•4.27 KiB

// Sentence Splitter for Semantic Chunking
// Created: 2025-12-27
// Purpose: Split text into sentences using Intl.Segmenter (Unicode standard)

// ============================================
// Constants
// ============================================

/**
 * Delimiter used to build placeholders for fenced code blocks during processing.
 * The NUL character is used because it should not occur in ordinary text.
 */
const CODE_BLOCK_PLACEHOLDER = '\u0000CODE_BLOCK\u0000'

/**
 * Delimiter used to build placeholders for inline code during processing.
 */
const INLINE_CODE_PLACEHOLDER = '\u0000INLINE_CODE\u0000'

// ============================================
// Types
// ============================================

/**
 * A piece of code that was lifted out of the text, paired with the
 * placeholder that temporarily stands in for it.
 */
interface CodeBlockInfo {
  placeholder: string
  content: string
}

// ============================================
// Helper Functions
// ============================================

/**
 * Extract code spans and replace them with unique placeholders.
 *
 * Fenced blocks (```...```) are extracted first, then inline code (`...`)
 * is extracted from the already-processed text. As a consequence an inline
 * entry's content may itself contain a fenced-block placeholder.
 *
 * @param text - The raw input text
 * @returns The text with placeholders substituted, and the extracted blocks
 *          in extraction order (all fenced blocks, then all inline spans)
 */
function extractCodeBlocks(text: string): { text: string; blocks: CodeBlockInfo[] } {
  const blocks: CodeBlockInfo[] = []

  // Replace every match exactly where it was found (single pass per pattern).
  // The running block count doubles as the unique index inside the placeholder.
  const stash = (input: string, pattern: RegExp, marker: string): string =>
    input.replace(pattern, (content: string) => {
      const placeholder = `${marker}${blocks.length}${marker}`
      blocks.push({ placeholder, content })
      return placeholder
    })

  // Fenced code blocks (```...```) must go first so their backticks are not
  // mistaken for inline code delimiters.
  const withoutFences = stash(text, /```[\s\S]*?```/g, CODE_BLOCK_PLACEHOLDER)

  // Inline code (`...`)
  const withoutCode = stash(withoutFences, /`[^`]+`/g, INLINE_CODE_PLACEHOLDER)

  return { text: withoutCode, blocks }
}

/**
 * Restore the original code in place of each placeholder.
 *
 * @param sentences - Sentences that may contain placeholders
 * @param blocks - Blocks as returned by extractCodeBlocks
 * @returns The sentences with every placeholder replaced by its original code
 */
function restoreCodeBlocks(sentences: string[], blocks: CodeBlockInfo[]): string[] {
  // Undo the extraction in reverse order: an inline span may have captured a
  // fenced-block placeholder, which only becomes visible once the inline span
  // has been restored.
  const restoreOrder = [...blocks].reverse()

  return sentences.map((sentence) => {
    let restored = sentence
    for (const block of restoreOrder) {
      // A replacer function is required here: a plain replacement string would
      // interpret "$$", "$&", "$`" and "$'" inside the code as special
      // replacement patterns and corrupt the restored content.
      restored = restored.replace(block.placeholder, () => block.content)
    }
    return restored
  })
}

// ============================================
// Intl.Segmenter-based splitting
// ============================================

// Create segmenters for supported languages
// Using 'und' (undetermined) as fallback for general Unicode support
const segmenter = new Intl.Segmenter('und', { granularity: 'sentence' })

/**
 * Split text into sentences using Intl.Segmenter
 *
 * Uses the Unicode Text Segmentation standard (UAX #29) via Intl.Segmenter.
 * This provides multilingual support for sentence boundary detection.
 *
 * Fenced and inline code is protected from splitting and returned verbatim,
 * and markdown headings are always emitted as a single sentence.
 *
 * Note: Intl.Segmenter may split on abbreviations like "Mr." or "e.g."
 * These edge cases are acceptable for semantic chunking as:
 * 1. Short fragments will be grouped with adjacent sentences by similarity
 * 2. Fragments below minChunkLength are filtered out
 *
 * @param text - The text to split into sentences
 * @returns Array of trimmed, non-empty sentences
 */
export function splitIntoSentences(text: string): string[] {
  // Handle empty input
  if (!text || text.trim().length === 0) {
    return []
  }

  // Extract code blocks to protect them from splitting
  const { text: processedText, blocks } = extractCodeBlocks(text)

  // Split on paragraph boundaries first: blank lines, a newline followed by
  // non-whitespace, or a newline directly after a placeholder.
  // biome-ignore lint/suspicious/noControlCharactersInRegex: Intentional use of NULL character as placeholder delimiter
  const paragraphs = processedText.split(/\n{2,}|\n(?=\S)|(?<=\u0000)\n/)

  const sentences: string[] = []

  for (const paragraph of paragraphs) {
    const trimmedParagraph = paragraph.trim()
    if (!trimmedParagraph) continue

    // Check if it's a markdown heading (treat as single sentence)
    if (/^#{1,6}\s/.test(trimmedParagraph)) {
      sentences.push(trimmedParagraph)
      continue
    }

    // Use Intl.Segmenter for sentence splitting
    for (const { segment } of segmenter.segment(trimmedParagraph)) {
      const trimmed = segment.trim()
      if (trimmed) {
        sentences.push(trimmed)
      }
    }
  }

  // Restore code blocks
  const restoredSentences = restoreCodeBlocks(sentences, blocks)

  // Filter empty sentences and trim
  return restoredSentences.map((s) => s.trim()).filter((s) => s.length > 0)
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/shinpr/mcp-local-rag'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

sentence-splitter.ts•4.27 KiB