"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.RecursiveCharacterTextSplitter = void 0;
exports.recursiveChunkText = recursiveChunkText;
/**
 * Splits text into chunks of at most `chunkSize` characters by trying
 * progressively finer separators: paragraph breaks, line breaks, spaces and
 * finally a hard character-level cut.
 *
 * `chunkOverlap` is only applied by the character-level cut; chunks produced
 * by merging on a separator do not overlap each other.
 */
class RecursiveCharacterTextSplitter {
    chunkSize;
    chunkOverlap;
    separators = ["\n\n", "\n", " ", ""];
    /**
     * @param {{ chunkSize: number, chunkOverlap?: number }} options
     *   `chunkSize` is the maximum chunk length in UTF-16 code units;
     *   `chunkOverlap` is the number of characters shared between consecutive
     *   hard-cut chunks (defaults to 0 when omitted).
     */
    constructor(options) {
        this.chunkSize = options.chunkSize;
        // An omitted overlap used to yield a NaN stride in splitByCharacter,
        // which ended the loop after one chunk and dropped the remaining text.
        this.chunkOverlap = options.chunkOverlap ?? 0;
    }
    /**
     * Splits `text` into chunks no longer than `chunkSize`.
     *
     * @param {string} text - Text to split.
     * @returns {string[]} Chunks in document order.
     * @throws {RangeError} If a hard character-level cut is needed and the
     *   chunkSize/chunkOverlap configuration is invalid.
     */
    splitText(text) {
        let pieces = [text];
        for (const separator of this.separators) {
            const refined = [];
            for (const piece of pieces) {
                // `<=` rather than `<`: a piece that fits exactly must be kept
                // verbatim, matching the limit used when merging pieces.
                if (piece.length <= this.chunkSize) {
                    refined.push(piece);
                    continue;
                }
                const parts = separator === ""
                    ? this.splitByCharacter(piece)
                    : this.splitBySeparator(piece, separator);
                // Push one by one: spreading a very large array into push()
                // can exceed the engine's argument limit.
                for (const part of parts) {
                    refined.push(part);
                }
            }
            pieces = refined;
        }
        return pieces;
    }
    /**
     * Splits `text` on `separator` and greedily merges neighbouring parts
     * back together while the merged length stays within `chunkSize`.
     * Parts that are too long on their own are emitted unchanged so that the
     * next, finer separator can break them down.
     *
     * @param {string} text - Text longer than `chunkSize`.
     * @param {string} separator - Non-empty separator to split on.
     * @returns {string[]} Merged parts; the separator is dropped at chunk boundaries.
     */
    splitBySeparator(text, separator) {
        const merged = [];
        let current = "";
        for (const part of text.split(separator)) {
            const overflows = current.length + part.length + separator.length > this.chunkSize;
            if (!overflows) {
                current = current ? current + separator + part : part;
            }
            else if (current) {
                // Close the chunk built so far and start a new one with this part.
                merged.push(current);
                current = part;
            }
            else {
                // Nothing accumulated and the part alone is too big: pass it on as-is.
                merged.push(part);
            }
        }
        if (current) {
            merged.push(current);
        }
        return merged;
    }
    /**
     * Hard-cuts `text` into windows of `chunkSize` characters, advancing by
     * `chunkSize - chunkOverlap` so consecutive windows share `chunkOverlap`
     * characters. Stops as soon as a window reaches the end of the text, so
     * no trailing chunk is emitted that is already contained in the previous one.
     *
     * @param {string} text - Text to cut.
     * @returns {string[]} Windows in order.
     * @throws {RangeError} If `chunkSize` is not positive or `chunkOverlap` is
     *   not in `[0, chunkSize)`.
     */
    splitByCharacter(text) {
        const size = this.chunkSize;
        const overlap = this.chunkOverlap;
        // Checked here rather than in the constructor so that only calls which
        // previously looped forever or lost text now fail. The negated
        // comparisons also reject NaN and undefined.
        if (!(size > 0) || !(overlap >= 0) || !(overlap < size)) {
            throw new RangeError(`Invalid chunk configuration: expected chunkSize > 0 and 0 <= chunkOverlap < chunkSize (got chunkSize=${size}, chunkOverlap=${overlap})`);
        }
        const stride = size - overlap;
        const chunks = [];
        for (let start = 0; start < text.length; start += stride) {
            chunks.push(text.slice(start, start + size));
            if (start + size >= text.length) {
                // This window already covers the tail of the text.
                break;
            }
        }
        return chunks;
    }
    /**
     * Splits `text` and wraps each chunk in a record carrying its index and
     * the page it came from.
     *
     * @param {string} text - Page text to split.
     * @param {*} pageNumber - Value copied verbatim onto every chunk.
     * @returns {{ id: string, text: string, pageNumber: * }[]} Chunk records;
     *   `id` is the chunk's index within this call, as a string.
     */
    createChunks(text, pageNumber) {
        return this.splitText(text).map((chunkText, index) => ({
            id: String(index), // This ID will be overwritten or appended to later
            text: chunkText,
            pageNumber,
        }));
    }
}
// Publish the class on the CommonJS exports object, replacing the `void 0`
// placeholder assigned at the top of the file. Class declarations are not
// usable before they are evaluated, so this assignment has to follow the class.
exports.RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter;
/**
 * Chunks every page with a single RecursiveCharacterTextSplitter and returns
 * all resulting chunk records in page order.
 *
 * @param {Iterable<{ text: string, pageNumber: * }>} pages - Pages to split.
 * @param {number} chunkSize - Maximum chunk length passed to the splitter.
 * @param {number} chunkOverlap - Overlap passed to the splitter.
 * @returns {Array} Flat list of the records returned by `createChunks` for each page.
 */
function recursiveChunkText(pages, chunkSize, chunkOverlap) {
    const chunker = new RecursiveCharacterTextSplitter({ chunkSize, chunkOverlap });
    const collected = [];
    for (const { text, pageNumber } of pages) {
        collected.push(...chunker.createChunks(text, pageNumber));
    }
    return collected;
}