Simple Memory Extension MCP Server

by gmacev
chunking-util.ts (5.88 kB)
import { logger } from './logger.js';

/**
 * Configuration for text chunking
 */
export interface ChunkingConfig {
  // Maximum tokens per chunk (default matches E5 model constraints)
  maxTokens: number;
  // Tokens to overlap between chunks for context preservation
  overlapTokens: number;
  // Rough estimation of characters per token (varies by language)
  charsPerToken: number;
}

// Default configuration optimized for the E5 model
export const DEFAULT_CHUNKING_CONFIG: ChunkingConfig = {
  maxTokens: 400, // Target 400 tokens to stay safely below 512 limit
  overlapTokens: 50, // ~12.5% overlap to maintain context
  charsPerToken: 4, // Rough estimate - varies by language
};

/**
 * Estimates token count from character count
 * This is a rough approximation - actual tokenization depends on model and language
 */
export function estimateTokenCount(
  text: string,
  charsPerToken: number = DEFAULT_CHUNKING_CONFIG.charsPerToken
): number {
  return Math.ceil(text.length / charsPerToken);
}

/**
 * Chunk text by semantic boundaries like paragraphs and sentences
 * Tries to respect natural text boundaries while staying within token limits
 */
export function chunkTextBySemanticBoundaries(
  text: string,
  config: ChunkingConfig = DEFAULT_CHUNKING_CONFIG
): string[] {
  logger.debug(`Chunking text of length ${text.length} characters`);

  // If text is already small enough, return as-is
  if (estimateTokenCount(text, config.charsPerToken) <= config.maxTokens) {
    logger.debug('Text fits in a single chunk, no chunking needed');
    return [text];
  }

  const chunks: string[] = [];

  // First split by double newlines (paragraphs)
  const paragraphs = text.split(/\n\s*\n/);
  logger.debug(`Split into ${paragraphs.length} paragraphs`);

  let currentChunk = '';
  let currentTokenCount = 0;

  // Process paragraph by paragraph
  for (let i = 0; i < paragraphs.length; i++) {
    const para = paragraphs[i];
    const paraTokens = estimateTokenCount(para, config.charsPerToken);

    // If this paragraph alone exceeds max tokens, split it into sentences
    if (paraTokens > config.maxTokens) {
      logger.debug(`Large paragraph found (est. ${paraTokens} tokens), splitting into sentences`);

      // If we have accumulated content in current chunk, save it first
      if (currentChunk) {
        chunks.push(currentChunk);
        currentChunk = '';
        currentTokenCount = 0;
      }

      // Split large paragraph into sentences and process them
      const sentences = para.split(/(?<=[.!?])\s+/);
      let sentenceChunk = '';
      let sentenceTokenCount = 0;

      for (const sentence of sentences) {
        const sentenceTokens = estimateTokenCount(sentence, config.charsPerToken);

        // If single sentence exceeds limit, we have to split it by character count
        if (sentenceTokens > config.maxTokens) {
          logger.debug(
            `Very long sentence found (est. ${sentenceTokens} tokens), splitting by character count`
          );

          // Save any accumulated content first
          if (sentenceChunk) {
            chunks.push(sentenceChunk);
            sentenceChunk = '';
            sentenceTokenCount = 0;
          }

          // Force split the long sentence into multiple chunks
          const maxChars = config.maxTokens * config.charsPerToken;
          for (let j = 0; j < sentence.length; j += maxChars) {
            const subChunk = sentence.substring(j, j + maxChars);
            chunks.push(subChunk);
          }
        }
        // If adding this sentence exceeds limit, save current and start new
        else if (sentenceTokenCount + sentenceTokens > config.maxTokens) {
          chunks.push(sentenceChunk);

          // Start new chunk with overlap if possible
          if (sentenceChunk && config.overlapTokens > 0) {
            // Extract last N tokens worth of text as overlap
            const overlapChars = config.overlapTokens * config.charsPerToken;
            const overlapText = sentenceChunk.substring(
              Math.max(0, sentenceChunk.length - overlapChars)
            );
            sentenceChunk = overlapText + ' ' + sentence;
            sentenceTokenCount = estimateTokenCount(sentenceChunk, config.charsPerToken);
          } else {
            sentenceChunk = sentence;
            sentenceTokenCount = sentenceTokens;
          }
        }
        // Otherwise add to current sentence chunk
        else {
          sentenceChunk = sentenceChunk ? `${sentenceChunk} ${sentence}` : sentence;
          sentenceTokenCount += sentenceTokens;
        }
      }

      // Add the last sentence chunk if not empty
      if (sentenceChunk) {
        chunks.push(sentenceChunk);
      }
    }
    // If adding this paragraph would exceed the token limit
    else if (currentTokenCount + paraTokens > config.maxTokens) {
      // Save current chunk
      chunks.push(currentChunk);

      // Start new chunk with overlap if possible
      if (currentChunk && config.overlapTokens > 0) {
        // Extract last N tokens worth of text as overlap
        const overlapChars = config.overlapTokens * config.charsPerToken;
        const overlapText = currentChunk.substring(Math.max(0, currentChunk.length - overlapChars));
        currentChunk = overlapText + '\n\n' + para;
        currentTokenCount = estimateTokenCount(currentChunk, config.charsPerToken);
      } else {
        currentChunk = para;
        currentTokenCount = paraTokens;
      }
    }
    // Otherwise add to current chunk
    else {
      if (currentChunk) {
        currentChunk += '\n\n' + para;
      } else {
        currentChunk = para;
      }
      currentTokenCount += paraTokens;
    }
  }

  // Add the last chunk if not empty
  if (currentChunk) {
    chunks.push(currentChunk);
  }

  logger.debug(`Text chunked into ${chunks.length} semantic chunks`);
  return chunks;
}
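
For context, a minimal usage sketch (not part of the original file): it assumes the module is imported from './chunking-util.js' in keeping with the project's ESM-style imports, and the sample text is purely illustrative.

import {
  chunkTextBySemanticBoundaries,
  estimateTokenCount,
  DEFAULT_CHUNKING_CONFIG,
} from './chunking-util.js';

// Imagine this is a long document well past the 400-token default budget.
const doc = [
  'First paragraph of a long document...',
  'Second paragraph...',
  'Third paragraph...',
].join('\n\n');

// Chunk with the defaults tuned for the E5 model (400-token chunks, 50-token overlap).
const chunks = chunkTextBySemanticBoundaries(doc);

// Or override the budget, e.g. for a model with a smaller context window.
const smallChunks = chunkTextBySemanticBoundaries(doc, {
  ...DEFAULT_CHUNKING_CONFIG,
  maxTokens: 200,
  overlapTokens: 25,
});

// Inspect the estimated size of each chunk.
chunks.forEach((chunk, i) => console.log(`chunk ${i}: ~${estimateTokenCount(chunk)} tokens`));

Note that estimateTokenCount is a characters-divided-by-4 heuristic, so the 400-token default leaves roughly 100 tokens of headroom under the E5 model's 512-token limit to absorb estimation error.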

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/gmacev/Simple-Memory-Extension-MCP-Server'
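
The same endpoint can be queried from code. A minimal TypeScript sketch, assuming a runtime with the global fetch API (Node 18+); the response shape is not documented here, so it is typed as unknown rather than guessing at fields:

// Query the Glama MCP directory API for this server's metadata.
const url =
  'https://glama.ai/api/mcp/v1/servers/gmacev/Simple-Memory-Extension-MCP-Server';

const res = await fetch(url);
if (!res.ok) {
  throw new Error(`Directory API request failed: ${res.status}`);
}
const server: unknown = await res.json();
console.log(server);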

If you have feedback or need assistance with the MCP directory API, please join our Discord server.