// Semantic Chunker implementation using Max-Min algorithm
// Based on: "Max–Min semantic chunking of documents for RAG application" (Springer, 2025)
import type { TextChunk } from './index.js'
import { splitIntoSentences } from './sentence-splitter.js'
// ============================================
// Type Definitions
// ============================================
/**
* Semantic Chunker configuration
* Based on paper recommendations: hardThreshold=0.6, initConst=1.5, c=0.9
*/
export interface SemanticChunkerConfig {
/** Hard threshold for minimum similarity (default: 0.6) */
hardThreshold: number
/** Initial constant for first sentence pair (default: 1.5) */
initConst: number
/** Scaling constant for threshold calculation (default: 0.9) */
c: number
/** Minimum chunk length in characters (default: 50) */
minChunkLength: number
}
/**
* Embedder interface for generating embeddings
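 *
 * @example
 * // A minimal sketch of a test double (hypothetical; any batch embedding
 * // client can be adapted to this shape):
 * const fakeEmbedder: EmbedderInterface = {
 *   async embedBatch(texts) {
 *     return texts.map((t) => [t.length, 1]) // toy 2-dimensional vectors
 *   },
 * }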
*/
export interface EmbedderInterface {
embedBatch(texts: string[]): Promise<number[][]>
}
// ============================================
// Performance Optimization Constants
// ============================================
/**
 * Number of recent sentences to compare in getMinSimilarity.
 * Based on the Max-Min paper's experimental conditions (median 5 sentences per chunk).
 * Caps the pairwise comparisons at C(WINDOW_SIZE, 2) = 10 per call,
 * reducing the cost from O(k²) in the chunk size k to O(1).
 */
const WINDOW_SIZE = 5
/**
 * Maximum number of sentences per chunk before a forced split.
 * Safety limit to keep per-sentence comparison costs bounded on highly
 * homogeneous documents.
 * Set to 3x the paper's median chunk size to leave a reasonable margin.
*/
const MAX_SENTENCES = 15
/**
* Check if a chunk is garbage (should be filtered out)
*
* Criteria (language-agnostic):
* 1. Empty after trimming
 * 2. Contains a letter or digit (any script) -> valid content (keep)
* 3. Only decoration characters (----, ====, etc.) -> garbage
* 4. Single character repeated >80% of text -> garbage
*
* Note: Applied after minChunkLength filter
*
* @param text - Chunk text to check
* @returns true if chunk is garbage and should be removed
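 *
 * @example
 * isGarbageChunk('----------')   // true: decoration characters only
 * isGarbageChunk('!!!!!!!!!?')   // true: one character is >80% of the text
 * isGarbageChunk('Hello world')  // false: contains letters/digits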
*/
export function isGarbageChunk(text: string): boolean {
const trimmed = text.trim()
if (trimmed.length === 0) return true
  // If it contains any letter or digit in any script, consider it valid content
  if (/[\p{L}\p{N}]/u.test(trimmed)) return false
// Decoration line patterns only (----, ====, ****, etc.)
if (/^[\-=_.*#|~`@!%^&*()\[\]{}\\/<>:+\s]+$/.test(trimmed)) return true
// Excessive repetition of single character (>80%)
const charCounts = new Map<string, number>()
for (const char of trimmed) {
charCounts.set(char, (charCounts.get(char) ?? 0) + 1)
}
const maxCount = Math.max(...charCounts.values())
if (maxCount / trimmed.length > 0.8) return true
return false
}
// ============================================
// Default Configuration
// ============================================
export const DEFAULT_SEMANTIC_CHUNKER_CONFIG: SemanticChunkerConfig = {
hardThreshold: 0.6,
initConst: 1.5,
c: 0.9,
minChunkLength: 50,
}
// ============================================
// SemanticChunker Class
// ============================================
/**
* Semantic chunker using Max-Min algorithm
*
* The algorithm groups consecutive sentences based on semantic similarity:
* 1. Split text into sentences
* 2. Generate embeddings for all sentences
 * 3. For each sentence, decide whether to add it to the current chunk or start a new one
 * 4. The decision compares the new sentence's maximum similarity to the chunk
 *    against a threshold derived from the minimum similarity within the chunk
*
* Key insight: A sentence belongs to a chunk if its maximum similarity to any chunk member
* is greater than the minimum similarity between existing chunk members (with threshold adjustment)
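 *
 * @example
 * // A minimal usage sketch; `embedder` is any EmbedderInterface
 * // implementation (e.g. the fakeEmbedder sketch above) and `documentText`
 * // is a placeholder for the input string.
 * const chunker = new SemanticChunker()
 * const chunks = await chunker.chunkText(documentText, embedder)
 * // chunks: [{ text: '...', index: 0 }, ...]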
*/
export class SemanticChunker {
private readonly config: SemanticChunkerConfig
constructor(config: Partial<SemanticChunkerConfig> = {}) {
this.config = { ...DEFAULT_SEMANTIC_CHUNKER_CONFIG, ...config }
}
/**
* Split text into semantically coherent chunks
*
* @param text - The text to chunk
* @param embedder - Embedder to generate sentence embeddings
* @returns Array of text chunks
*/
async chunkText(text: string, embedder: EmbedderInterface): Promise<TextChunk[]> {
// Handle empty input
if (!text || text.trim().length === 0) {
return []
}
// Split into sentences
const sentences = splitIntoSentences(text)
if (sentences.length === 0) {
return []
}
// Generate embeddings for all sentences
const embeddings = await embedder.embedBatch(sentences)
// Apply Max-Min algorithm to group sentences into chunks
const sentenceGroups = this.groupSentences(sentences, embeddings)
// Convert groups to TextChunks
const chunks: TextChunk[] = []
let chunkIndex = 0
for (const group of sentenceGroups) {
const chunkText = group.join(' ')
// Filter out chunks that are too short or garbage
if (chunkText.length >= this.config.minChunkLength && !isGarbageChunk(chunkText)) {
chunks.push({
text: chunkText,
index: chunkIndex,
})
chunkIndex++
}
}
return chunks
}
/**
* Group sentences into chunks using Max-Min algorithm
*/
private groupSentences(sentences: string[], embeddings: number[][]): string[][] {
if (sentences.length === 0) return []
if (sentences.length === 1) return [[sentences[0] ?? '']]
const groups: string[][] = []
let currentGroup: string[] = []
let currentGroupEmbeddings: number[][] = []
for (let i = 0; i < sentences.length; i++) {
const sentence = sentences[i]
const embedding = embeddings[i]
if (!sentence || !embedding) continue
if (currentGroup.length === 0) {
// Start new group with first sentence
currentGroup.push(sentence)
currentGroupEmbeddings.push(embedding)
} else if (currentGroup.length === 1) {
// Special case for second sentence (init phase)
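        // With the defaults (initConst = 1.5, hardThreshold = 0.6), this
        // accepts the second sentence whenever similarity > 0.4.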
const firstEmbedding = currentGroupEmbeddings[0]
if (!firstEmbedding) continue
const similarity = this.cosineSimilarity(firstEmbedding, embedding)
if (this.config.initConst * similarity > this.config.hardThreshold) {
// Add to current group
currentGroup.push(sentence)
currentGroupEmbeddings.push(embedding)
} else {
// Start new group
groups.push([...currentGroup])
currentGroup = [sentence]
currentGroupEmbeddings = [embedding]
}
} else {
// Force split if chunk reaches MAX_SENTENCES (safety limit for performance)
if (currentGroup.length >= MAX_SENTENCES) {
groups.push([...currentGroup])
currentGroup = [sentence]
currentGroupEmbeddings = [embedding]
continue
}
// Normal case: check if sentence should join current group
const shouldAdd = this.shouldAddToChunk(embedding, currentGroupEmbeddings)
if (shouldAdd) {
currentGroup.push(sentence)
currentGroupEmbeddings.push(embedding)
} else {
// Start new group
groups.push([...currentGroup])
currentGroup = [sentence]
currentGroupEmbeddings = [embedding]
}
}
}
// Don't forget the last group
if (currentGroup.length > 0) {
groups.push(currentGroup)
}
return groups
}
/**
* Decide if a sentence should be added to the current chunk
* Based on Max-Min algorithm from the paper
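   *
   * @example
   * // Worked example with the defaults (c = 0.9, hardThreshold = 0.6) for a
   * // chunk of 3 sentences whose minimum pairwise similarity is 0.8:
   * //   threshold = max(0.9 * 0.8 * sigmoid(3), 0.6) ≈ max(0.686, 0.6) = 0.686
   * // so the new sentence joins only if its max similarity exceeds ≈0.686.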
*/
private shouldAddToChunk(newEmbedding: number[], chunkEmbeddings: number[][]): boolean {
// Calculate min similarity within current chunk
const minSim = this.getMinSimilarity(chunkEmbeddings)
// Calculate max similarity between new sentence and chunk
const maxSim = this.getMaxSimilarity(newEmbedding, chunkEmbeddings)
// Calculate dynamic threshold
const threshold = this.calculateThreshold(minSim, chunkEmbeddings.length)
return maxSim > threshold
}
/**
* Get minimum pairwise similarity within a chunk.
* Only compares the last WINDOW_SIZE sentences for O(1) complexity.
* This approximation is valid because recent sentences are most relevant
* for determining chunk coherence (per Max-Min paper's experimental setup).
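   *
   * For example, with WINDOW_SIZE = 5 and a 12-sentence chunk, only the 10
   * pairwise similarities among the last 5 sentences are computed.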
*/
private getMinSimilarity(embeddings: number[][]): number {
if (embeddings.length < 2) return 1.0
// Only compare the last WINDOW_SIZE embeddings to reduce O(k²) to O(1)
const startIdx = Math.max(0, embeddings.length - WINDOW_SIZE)
const windowEmbeddings = embeddings.slice(startIdx)
let minSim = 1.0
for (let i = 0; i < windowEmbeddings.length; i++) {
for (let j = i + 1; j < windowEmbeddings.length; j++) {
const embI = windowEmbeddings[i]
const embJ = windowEmbeddings[j]
if (!embI || !embJ) continue
const sim = this.cosineSimilarity(embI, embJ)
if (sim < minSim) {
minSim = sim
}
}
}
return minSim
}
/**
* Get maximum similarity between a sentence and any sentence in the chunk
*/
private getMaxSimilarity(embedding: number[], chunkEmbeddings: number[][]): number {
let maxSim = -1.0
for (const chunkEmb of chunkEmbeddings) {
const sim = this.cosineSimilarity(embedding, chunkEmb)
if (sim > maxSim) {
maxSim = sim
}
}
return maxSim
}
/**
* Calculate dynamic threshold based on chunk size
* threshold = max(c * minSim * sigmoid(|C|), hardThreshold)
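   *
   * Note: sigmoid(|C|) approaches 1 quickly (sigmoid(2) ≈ 0.88, sigmoid(5) ≈ 0.99),
   * so beyond a few sentences the threshold is effectively c * minSim,
   * floored at hardThreshold.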
*/
private calculateThreshold(minSim: number, chunkSize: number): number {
const sigmoidValue = this.sigmoid(chunkSize)
const dynamicThreshold = this.config.c * minSim * sigmoidValue
return Math.max(dynamicThreshold, this.config.hardThreshold)
}
/**
* Sigmoid function
*/
private sigmoid(x: number): number {
return 1 / (1 + Math.exp(-x))
}
/**
* Calculate cosine similarity between two vectors
* Public for testing
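   *
   * @example
   * // `chunker` is any SemanticChunker instance
   * chunker.cosineSimilarity([1, 0], [0, 1]) // 0 (orthogonal)
   * chunker.cosineSimilarity([1, 2], [2, 4]) // 1 (same direction)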
*/
cosineSimilarity(vec1: number[], vec2: number[]): number {
if (vec1.length !== vec2.length || vec1.length === 0) {
return 0
}
let dotProduct = 0
let norm1 = 0
let norm2 = 0
for (let i = 0; i < vec1.length; i++) {
const v1 = vec1[i] ?? 0
const v2 = vec2[i] ?? 0
dotProduct += v1 * v2
norm1 += v1 * v1
norm2 += v2 * v2
}
const denominator = Math.sqrt(norm1) * Math.sqrt(norm2)
if (denominator === 0) return 0
return dotProduct / denominator
}
}