/** Tuning options accepted by `chunkText`. */
export interface ChunkOptions {
/** Target maximum length of one chunk; compared against `String.length` (UTF-16 code units). */
maxChunkSize: number;
/** How many characters from the end of a finished chunk are repeated at the start of the next one. */
overlap: number;
}
/** Option values that `chunkText` uses for any field the caller does not supply. */
const defaults: ChunkOptions = {
maxChunkSize: 2000,
overlap: 256,
};
/**
 * Separators in the order they are tried, coarsest first: blank line, line
 * break, sentence end, comma, space. The trailing empty string always matches
 * and makes the splitter fall back to individual characters.
 */
const SEPARATORS = ['\n\n', '\n', '. ', ', ', ' ', ''];
/**
 * Recursive character text splitter.
 *
 * Splits `text` into chunks that aim to stay within `maxChunkSize`, trying the
 * coarsest separator in `SEPARATORS` first (paragraph breaks) and falling back
 * to progressively finer ones (lines, sentences, clauses, words, characters)
 * only for pieces that are still too long.
 *
 * @param text - The text to split.
 * @param opts - Optional overrides. A field that is missing **or explicitly
 *   `undefined`** falls back to the module defaults.
 * @returns The trimmed chunks in document order; an empty array when `text`
 *   is empty or contains only whitespace.
 */
export function chunkText(text: string, opts: Partial<ChunkOptions> = {}): string[] {
  // Resolve each option with `??` instead of object spread: spreading
  // `{ maxChunkSize: undefined }` over the defaults would overwrite the default
  // with `undefined`, which silently disables every size comparison downstream.
  const maxChunkSize = opts.maxChunkSize ?? defaults.maxChunkSize;
  const overlap = opts.overlap ?? defaults.overlap;

  // No separate "already fits" shortcut is needed here: recursiveSplit starts
  // with that exact check and returns the trimmed text as a single chunk.
  return recursiveSplit(text, SEPARATORS, maxChunkSize, overlap);
}
/**
 * Splits `text` on the first separator in `separators` that occurs in it,
 * greedily packs the resulting parts into chunks of at most `maxSize`
 * characters, and re-splits any chunk that is still too long using the
 * remaining (finer) separators.
 *
 * @param text - Text to split.
 * @param separators - Candidate separators, coarsest first. An empty string
 *   means "split into individual characters"; it is also the fallback when no
 *   listed separator occurs in `text`.
 * @param maxSize - Target maximum chunk length in UTF-16 code units.
 * @param overlap - Number of trailing characters of a finished chunk to repeat
 *   at the start of the next one. Values that are negative or NaN are treated
 *   as 0, and values of `maxSize` or more are capped at `maxSize - 1` so that
 *   every chunk still makes forward progress.
 * @returns Trimmed, non-empty chunks in document order.
 */
function recursiveSplit(
  text: string,
  separators: string[],
  maxSize: number,
  overlap: number
): string[] {
  if (text.length <= maxSize) {
    return [text.trim()].filter(Boolean);
  }

  // Pick the coarsest separator present in the text. When none matches we fall
  // back to per-character splitting, after which there is nothing finer to try.
  const sepIndex = separators.findIndex((s) => s === '' || text.includes(s));
  const sep = separators[sepIndex] ?? '';
  const nextSeps = sepIndex === -1 ? [] : separators.slice(sepIndex + 1);
  // Spreading iterates by code point, so surrogate pairs stay together.
  const parts = sep === '' ? [...text] : text.split(sep);

  // An overlap of maxSize or more would carry the entire previous chunk
  // forward, making chunks grow without bound instead of advancing.
  const overlapCap = Math.min(overlap, maxSize - 1);
  const maxOverlap = overlapCap > 0 ? Math.floor(overlapCap) : 0;

  const chunks: string[] = [];
  let current = '';
  for (const part of parts) {
    const candidate = current ? current + sep + part : part;
    if (candidate.length > maxSize && current) {
      const finished = current.trim();
      if (finished) {
        chunks.push(finished);
      }

      // Carry only as much overlap as still leaves room for the separator and
      // the next part. A part that is oversized on its own will be re-split
      // below anyway, so it keeps the full overlap as leading context.
      const room = Math.max(0, maxSize - sep.length - part.length);
      const carry = part.length > maxSize ? maxOverlap : Math.min(maxOverlap, room);
      let tailStart = Math.max(0, current.length - carry);
      // Do not begin the overlap in the middle of a surrogate pair.
      const code = current.charCodeAt(tailStart);
      if (tailStart > 0 && code >= 0xdc00 && code <= 0xdfff) {
        tailStart += 1;
      }
      const tail = current.slice(tailStart);
      // With no overlap text, start cleanly from the part (no dangling separator).
      current = tail ? tail + sep + part : part;
    } else {
      current = candidate;
    }
  }

  const last = current.trim();
  if (last) {
    chunks.push(last);
  }

  // Anything still too large is split again with the finer separators.
  return chunks.flatMap((chunk) =>
    chunk.length > maxSize && nextSeps.length > 0
      ? recursiveSplit(chunk, nextSeps, maxSize, overlap)
      : [chunk]
  );
}