Skip to main content
Glama
calculateChunks.ts (4.21 kB)
import { splitTextByLines } from '@intlayer/chokidar';

/**
 * A piece of the source text, located both by line and by character.
 */
export type ChunkLineResult = {
  /** Zero-based index (in the `splitTextByLines` result) of the first line covered. */
  lineStart: number;
  /** Number of lines covered; a partially covered line counts as one. */
  lineLength: number;
  /** Offset of the first character, obtained by accumulating the lengths of the preceding lines. */
  charStart: number;
  /** Number of characters in `content`. */
  charLength: number;
  /** Text of the chunk. */
  content: string;
};

const DEFAULT_MAX_CHARS_PER_CHUNK = 800;
const DEFAULT_OVERLAP_CHARS = 0;

// Wraps every line returned by splitTextByLines in a one-line chunk carrying its offsets.
const toLineChunks = (text: string): ChunkLineResult[] => {
  let charOffset = 0;

  return splitTextByLines(text).map((content, index) => {
    const lineChunk: ChunkLineResult = {
      content,
      lineStart: index,
      lineLength: 1,
      charStart: charOffset,
      charLength: content.length,
    };

    // NOTE(review): offsets only match positions in `text` if the helper keeps
    // the line terminators inside each line - confirm against splitTextByLines.
    charOffset += content.length;

    return lineChunk;
  });
};

// Greedily merges consecutive lines while the merged content fits in maxCharsPerChunk.
const groupLines = (
  lines: ChunkLineResult[],
  maxCharsPerChunk: number
): ChunkLineResult[] => {
  const grouped: ChunkLineResult[] = [];

  for (const line of lines) {
    const last = grouped[grouped.length - 1];

    // A line that is itself longer than the limit can never satisfy this check,
    // so oversized lines always end up alone in their own chunk.
    const fitsInLast =
      last !== undefined &&
      last.content.length + line.content.length <= maxCharsPerChunk;

    if (last === undefined || !fitsInLast) {
      grouped.push(line);
      continue;
    }

    const content = last.content + line.content;

    grouped[grouped.length - 1] = {
      content,
      lineStart: last.lineStart,
      lineLength: last.lineLength + line.lineLength,
      charStart: last.charStart,
      charLength: content.length,
    };
  }

  return grouped;
};

// Cuts every chunk longer than maxCharsPerChunk into consecutive fixed-size slices.
const splitOversizedChunks = (
  chunks: ChunkLineResult[],
  maxCharsPerChunk: number
): ChunkLineResult[] =>
  chunks.flatMap((chunk) => {
    if (chunk.content.length <= maxCharsPerChunk) return [chunk];

    const slices: ChunkLineResult[] = [];

    for (
      let offset = 0;
      offset < chunk.content.length;
      offset += maxCharsPerChunk
    ) {
      const content = chunk.content.slice(offset, offset + maxCharsPerChunk);

      slices.push({
        content,
        // Only single lines can be oversized here, so every slice stays on that line
        lineStart: chunk.lineStart,
        lineLength: 1,
        charStart: chunk.charStart + offset,
        charLength: content.length,
      });
    }

    return slices;
  });

// Prepends to every chunk but the first the tail of the chunk that precedes it.
const addOverlap = (
  chunks: ChunkLineResult[],
  overlapChars: number
): ChunkLineResult[] => {
  const overlapped: ChunkLineResult[] = chunks.slice(0, 1);

  for (let i = 1; i < chunks.length; i++) {
    const previousChunk = chunks[i - 1];
    const chunk = chunks[i];

    const tail = previousChunk.content.slice(-overlapChars);

    // When the tail spans several lines its first line is most likely cut in
    // the middle: drop everything up to and including the first newline.
    const tailLineCount = splitTextByLines(tail).length;
    const overlap =
      tailLineCount > 1 ? tail.slice(tail.indexOf('\n') + 1) : tail;

    const content = overlap + chunk.content;
    const lineLength = splitTextByLines(content).length;

    overlapped.push({
      content,
      // Move the start back by the number of lines gained through the overlap
      lineStart: chunk.lineStart - (lineLength - chunk.lineLength),
      lineLength,
      charStart: chunk.charStart - overlap.length,
      charLength: content.length,
    });
  }

  return overlapped;
};

/**
 * Splits a text into chunks of at most `maxCharsPerChunk` characters.
 *
 * The text is split into lines, consecutive lines are merged while they fit
 * in the limit, and a single line longer than the limit is cut into
 * fixed-size slices. When an overlap is requested, the tail of each chunk is
 * then prepended to the next one, so overlapped chunks may be longer than
 * `maxCharsPerChunk`.
 *
 * @param text - Text to split.
 * @param maxCharsPerChunk - Maximum number of characters per chunk, before
 * the overlap is added. Defaults to 800.
 * @param overlapChars - Number of trailing characters of the previous chunk
 * to repeat at the start of each chunk. The value is rounded down; anything
 * that is not at least 1 (zero, negative, `NaN`) disables the overlap.
 * Defaults to 0.
 * @returns The chunks in document order, each with its line and character position.
 * @throws Error when `maxCharsPerChunk` is not a number greater than 0.
 */
export const chunkText = (
  text: string,
  maxCharsPerChunk: number = DEFAULT_MAX_CHARS_PER_CHUNK,
  overlapChars: number = DEFAULT_OVERLAP_CHARS
): ChunkLineResult[] => {
  // Written as a negated `>` so that NaN is rejected too: with NaN every size
  // comparison below would be false and an empty chunk would be produced.
  if (!(maxCharsPerChunk > 0)) {
    throw new Error('maxCharsPerChunk must be greater than 0');
  }

  const lines = toLineChunks(text);
  const groupedLines = groupLines(lines, maxCharsPerChunk);
  const sizedChunks = splitOversizedChunks(groupedLines, maxCharsPerChunk);

  // `slice(-n)` only means "the last n characters" for n >= 1: a negative n
  // becomes a positive start index, and NaN or a fraction below 1 truncates
  // to 0, which would copy the whole previous chunk.
  const overlap = Math.floor(overlapChars);
  if (!(overlap > 0)) return sizedChunks;

  return addOverlap(sizedChunks, overlap);
};

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/aymericzip/intlayer'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.