/**
* Topic chunker for splitting long documentation topics into LLM-friendly chunks
*/
import type { Topic, TopicChunk } from "../../types.js";
import { logger } from "../utils/logger.js";
/**
* Estimate token count for text (rough approximation: ~4 characters per token)
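*
* @example
* estimateTokens("hello world"); // 11 chars -> Math.ceil(11 / 4) = 3 tokens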
*/
function estimateTokens(text: string): number {
return Math.ceil(text.length / 4);
}
/**
* Split a topic's content into chunks based on headings and token limits
*
* This function can either:
* 1. Re-chunk existing chunks (if bodyText is not provided)
* 2. Chunk raw body text (if bodyText is provided)
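*
* @param topic Topic whose content is chunked
* @param maxChunkSize Maximum estimated tokens per chunk
* @param bodyText Optional raw body text to chunk instead of the existing chunks
*
* @example
* // Hypothetical usage; `topic` and `rawMarkdown` are assumed to exist:
* const rechunked = chunkTopic(topic); // re-pack topic.body_chunks at ~1200 tokens
* const fresh = chunkTopic(topic, 800, rawMarkdown); // chunk raw text instead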
*/
export function chunkTopic(
topic: Topic,
maxChunkSize: number = 1200,
bodyText?: string
): TopicChunk[] {
// If body text is provided, use chunkBodyText directly
if (bodyText !== undefined) {
return chunkBodyText(topic.id, bodyText, maxChunkSize);
}
// Otherwise, re-chunk existing chunks
const chunks: TopicChunk[] = [];
// If the topic has no body content, there is nothing to chunk
if (!topic.body_chunks || topic.body_chunks.length === 0) {
logger.warn("Topic has no body content to chunk", { topic_id: topic.id });
return [];
}
// Merge small chunks and split oversized ones against the token budget
let currentChunk = "";
let currentChunkIndex = 0;
for (const existingChunk of topic.body_chunks) {
const chunkText = existingChunk.text;
const chunkTokens = estimateTokens(chunkText);
// The chunk fits the budget: merge it into the current chunk, flushing first if full
if (chunkTokens <= maxChunkSize) {
if (currentChunk && estimateTokens(currentChunk) + chunkTokens > maxChunkSize) {
// Current chunk is full, save it and start new one
chunks.push({
topic_id: topic.id,
chunk_index: currentChunkIndex++,
text: currentChunk.trim(),
token_count: estimateTokens(currentChunk),
});
currentChunk = chunkText;
} else {
// Add to current chunk
currentChunk = currentChunk ? `${currentChunk}\n\n${chunkText}` : chunkText;
}
} else {
// Chunk is too large, split it by headings
if (currentChunk) {
chunks.push({
topic_id: topic.id,
chunk_index: currentChunkIndex++,
text: currentChunk.trim(),
token_count: estimateTokens(currentChunk),
});
currentChunk = "";
}
// Split large chunk by headings
const headingRegex = /^(#{1,6})\s+(.+)$/gm;
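// e.g. "## Install" captures "##" (level) and "Install" (title)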
const parts: string[] = [];
let lastIndex = 0;
let match: RegExpExecArray | null;
while ((match = headingRegex.exec(chunkText)) !== null) {
if (match.index > lastIndex) {
parts.push(chunkText.substring(lastIndex, match.index));
}
lastIndex = match.index;
}
if (lastIndex < chunkText.length) {
parts.push(chunkText.substring(lastIndex));
}
// Process parts
for (const part of parts) {
const partTokens = estimateTokens(part);
if (partTokens <= maxChunkSize) {
if (currentChunk && estimateTokens(currentChunk) + partTokens > maxChunkSize) {
chunks.push({
topic_id: topic.id,
chunk_index: currentChunkIndex++,
text: currentChunk.trim(),
token_count: estimateTokens(currentChunk),
});
currentChunk = part;
} else {
currentChunk = currentChunk ? `${currentChunk}\n\n${part}` : part;
}
} else {
// Part is still too large, split by sentences
if (currentChunk) {
chunks.push({
topic_id: topic.id,
chunk_index: currentChunkIndex++,
text: currentChunk.trim(),
token_count: estimateTokens(currentChunk),
});
currentChunk = "";
}
// Split by sentences (rough approximation)
const sentences = part.split(/(?<=[.!?])\s+/);
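// e.g. "Install it. Run it! Done?" -> ["Install it.", "Run it!", "Done?"]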
let sentenceChunk = "";
for (const sentence of sentences) {
const sentenceTokens = estimateTokens(sentence);
if (estimateTokens(sentenceChunk) + sentenceTokens > maxChunkSize && sentenceChunk) {
chunks.push({
topic_id: topic.id,
chunk_index: currentChunkIndex++,
text: sentenceChunk.trim(),
token_count: estimateTokens(sentenceChunk),
});
sentenceChunk = sentence;
} else {
sentenceChunk = sentenceChunk ? `${sentenceChunk} ${sentence}` : sentence;
}
}
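// Carry leftover sentences forward so later parts can merge into them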
if (sentenceChunk) {
currentChunk = sentenceChunk;
}
}
}
}
}
// Add remaining chunk
if (currentChunk.trim()) {
chunks.push({
topic_id: topic.id,
chunk_index: currentChunkIndex++,
text: currentChunk.trim(),
token_count: estimateTokens(currentChunk),
});
}
logger.debug("Chunked topic", {
topic_id: topic.id,
chunk_count: chunks.length,
total_tokens: chunks.reduce((sum, c) => sum + (c.token_count || 0), 0),
});
return chunks;
}
/**
* Chunk a topic's body text (for initial parsing)
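*
* @param topicId ID stamped onto every produced chunk
* @param bodyText Raw markdown body to split
* @param maxChunkSize Maximum estimated tokens per chunk
*
* @example
* // Hypothetical inline markdown; both sections fit in one ~1200-token chunk:
* const chunks = chunkBodyText("topic-1", "# Intro\n\nHello.\n\n## Usage\n\nRun it.");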
*/
export function chunkBodyText(
topicId: string,
bodyText: string,
maxChunkSize: number = 1200
): TopicChunk[] {
const chunks: TopicChunk[] = [];
if (!bodyText || bodyText.trim().length === 0) {
return chunks;
}
// Split by headings first (markdown-style headers)
const headingRegex = /^(#{1,6})\s+(.+)$/gm;
const sections: Array<{ level: number; title: string; content: string }> = [];
let match: RegExpExecArray | null;
// Find all headings
const headingMatches: Array<{ index: number; level: number; title: string }> = [];
while ((match = headingRegex.exec(bodyText)) !== null) {
headingMatches.push({
index: match.index,
level: match[1].length,
title: match[2].trim(),
});
}
// If no headings, split by paragraphs
if (headingMatches.length === 0) {
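// Paragraphs are delimited by blank lines (which may contain whitespace)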
const paragraphs = bodyText.split(/\n\s*\n/).filter((p) => p.trim());
let currentChunk = "";
let chunkIndex = 0;
for (const paragraph of paragraphs) {
const paraTokens = estimateTokens(paragraph);
if (estimateTokens(currentChunk) + paraTokens > maxChunkSize && currentChunk) {
chunks.push({
topic_id: topicId,
chunk_index: chunkIndex++,
text: currentChunk.trim(),
token_count: estimateTokens(currentChunk),
});
currentChunk = paragraph;
} else {
currentChunk = currentChunk ? `${currentChunk}\n\n${paragraph}` : paragraph;
}
}
if (currentChunk.trim()) {
chunks.push({
topic_id: topicId,
chunk_index: chunkIndex++,
text: currentChunk.trim(),
token_count: estimateTokens(currentChunk),
});
}
return chunks;
}
// Capture introductory content that appears before the first heading
if (headingMatches[0].index > 0) {
const introText = bodyText.substring(0, headingMatches[0].index).trim();
if (introText) {
sections.push({
level: 0, // Special level for introductory content
title: "",
content: introText,
});
}
}
// Process sections between headings
for (let i = 0; i < headingMatches.length; i++) {
const heading = headingMatches[i];
const nextHeading = headingMatches[i + 1];
const startIndex = heading.index;
const endIndex = nextHeading ? nextHeading.index : bodyText.length;
// Extract heading line
const headingLineEnd = bodyText.indexOf("\n", startIndex);
const headingLine = headingLineEnd >= 0
? bodyText.substring(startIndex, headingLineEnd + 1)
: bodyText.substring(startIndex);
const contentStart = startIndex + headingLine.length;
const content = bodyText.substring(contentStart, endIndex).trim();
sections.push({
level: heading.level,
title: heading.title,
content,
});
}
// Group sections into chunks
let currentChunk = "";
let chunkIndex = 0;
for (const section of sections) {
// Rebuild the section text (introductory content has no heading to restore)
const headingPrefix = section.level === 0 ? "" : `${"#".repeat(section.level)} ${section.title}\n\n`;
const sectionText = `${headingPrefix}${section.content}`;
const sectionTokens = estimateTokens(sectionText);
if (sectionTokens > maxChunkSize) {
// Section is too large, save current chunk and split section
if (currentChunk) {
chunks.push({
topic_id: topicId,
chunk_index: chunkIndex++,
text: currentChunk.trim(),
token_count: estimateTokens(currentChunk),
});
currentChunk = "";
}
// Split the oversized section by paragraphs, repeating the heading in each piece
const paragraphs = section.content.split(/\n\s*\n/).filter((p) => p.trim());
// Length guard so a chunk containing only the repeated heading is never emitted
const minLength = section.level === 0 ? 10 : section.title.length + 10;
let sectionChunk = headingPrefix;
for (const paragraph of paragraphs) {
const paraTokens = estimateTokens(paragraph);
if (estimateTokens(sectionChunk) + paraTokens > maxChunkSize && sectionChunk.trim().length > minLength) {
chunks.push({
topic_id: topicId,
chunk_index: chunkIndex++,
text: sectionChunk.trim(),
token_count: estimateTokens(sectionChunk),
});
sectionChunk = `${headingPrefix}${paragraph}`;
} else {
sectionChunk = `${sectionChunk}${paragraph}\n\n`;
}
}
// Keep the trailing piece only if it holds more than the repeated heading
if (sectionChunk.trim().length > minLength) {
currentChunk = sectionChunk;
}
} else if (estimateTokens(currentChunk) + sectionTokens > maxChunkSize && currentChunk) {
// Current chunk is full, save it
chunks.push({
topic_id: topicId,
chunk_index: chunkIndex++,
text: currentChunk.trim(),
token_count: estimateTokens(currentChunk),
});
currentChunk = sectionText;
} else {
// Add to current chunk
currentChunk = currentChunk ? `${currentChunk}\n\n${sectionText}` : sectionText;
}
}
// Add remaining chunk
if (currentChunk.trim()) {
chunks.push({
topic_id: topicId,
chunk_index: chunkIndex++,
text: currentChunk.trim(),
token_count: estimateTokens(currentChunk),
});
}
logger.debug("Chunked body text", {
topic_id: topicId,
chunk_count: chunks.length,
total_tokens: chunks.reduce((sum, c) => sum + (c.token_count || 0), 0),
});
return chunks;
}
/**
* Limit a chunk list to a total token budget (for tool/resource responses)
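*
* Chunks are kept in input order until adding the next one would exceed the budget.
*
* @example
* // Hypothetical budget of 4000 tokens for a tool response:
* const trimmed = limitChunks(allChunks, 4000);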
*/
export function limitChunks(chunks: TopicChunk[], maxTokens: number = 8000): TopicChunk[] {
let totalTokens = 0;
const limited: TopicChunk[] = [];
for (const chunk of chunks) {
const chunkTokens = chunk.token_count || estimateTokens(chunk.text);
if (totalTokens + chunkTokens > maxTokens) {
break;
}
limited.push(chunk);
totalTokens += chunkTokens;
}
if (limited.length < chunks.length) {
logger.debug("Limited chunks due to token budget", {
original_count: chunks.length,
limited_count: limited.length,
total_tokens: totalTokens,
max_tokens: maxTokens,
});
}
return limited;
}