"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractTextFromPdf = extractTextFromPdf;
exports.chunkText = chunkText;
const fs_1 = __importDefault(require("fs"));
const pdf_parse_1 = __importDefault(require("pdf-parse"));
const crypto_1 = __importDefault(require("crypto"));
/**
 * Extracts the text of each page of a PDF file, tagged with its page number.
 *
 * A custom `pagerender` hook is handed to pdf-parse so that the text of every
 * page can be captured individually while the document is parsed; the combined
 * text that pdf-parse itself returns is ignored.
 *
 * @param {string} filePath - Location of the PDF file to read.
 * @returns {Promise<Array<{text: string, pageNumber: number}>>} One entry per
 *   captured page, sorted by ascending 1-based page number.
 */
async function extractTextFromPdf(filePath) {
    // Read without blocking the event loop; a failed read still rejects the
    // returned promise, exactly as a synchronous throw inside an async
    // function would.
    const dataBuffer = await fs_1.default.promises.readFile(filePath);
    const pages = [];
    const options = {
        pagerender: async (pageData) => {
            const textContent = await pageData.getTextContent();
            let lastY;
            let text = '';
            for (const item of textContent.items) {
                // transform[5] is used as the item's vertical position: items
                // sharing it are joined on one line, a change starts a new line.
                const y = item.transform[5];
                // Compare against undefined explicitly so that a previous
                // position of exactly 0 is not mistaken for "no previous item".
                if (lastY === undefined || lastY === y) {
                    text += item.str;
                }
                else {
                    text += `\n${item.str}`;
                }
                lastY = y;
            }
            // pageData.pageIndex is 0-based; expose 1-based page numbers.
            pages.push({ text, pageNumber: pageData.pageIndex + 1 });
            // The hook must still resolve to the page text for pdf-parse.
            return text;
        },
    };
    await (0, pdf_parse_1.default)(dataBuffer, options);
    // Guarantee page order regardless of the order the hook was invoked in.
    pages.sort((a, b) => a.pageNumber - b.pageNumber);
    return pages;
}
/**
 * Legacy fixed-window chunker: splits `text` into consecutive windows of at
 * most `chunkSize` characters, where each window starts
 * `chunkSize - chunkOverlap` characters after the previous one.
 *
 * Each chunk id is the md5 hex digest of the chunk text concatenated with its
 * start offset, so ids are stable for identical input and parameters.
 *
 * Note: the final window may be shorter than `chunkSize`, and with a non-zero
 * overlap it can lie entirely inside the previous window.
 *
 * @param {string} text - Text to split.
 * @param {number} chunkSize - Maximum characters per chunk; must be > 0.
 * @param {number} [chunkOverlap=0] - Characters shared between neighbouring
 *   chunks; must be smaller than `chunkSize`.
 * @returns {Array<{id: string, text: string}>} Chunks in document order; empty
 *   when `text` is empty.
 * @throws {RangeError} If `chunkSize` is not a positive number or
 *   `chunkOverlap` is not smaller than `chunkSize`.
 */
function chunkText(text, chunkSize, chunkOverlap = 0) {
    // `!(x > 0)` also rejects NaN, which would otherwise slip past `x <= 0`.
    if (!(chunkSize > 0)) {
        throw new RangeError(`chunkSize must be a positive number, got ${chunkSize}`);
    }
    const step = chunkSize - chunkOverlap;
    // A non-positive step never advances the window and would loop forever.
    if (!(step > 0)) {
        throw new RangeError(`chunkOverlap (${chunkOverlap}) must be smaller than chunkSize (${chunkSize})`);
    }
    const chunks = [];
    for (let start = 0; start < text.length; start += step) {
        const end = Math.min(start + chunkSize, text.length);
        const piece = text.slice(start, end);
        chunks.push({
            id: crypto_1.default.createHash("md5").update(`${piece}${start}`).digest("hex"),
            text: piece,
        });
    }
    return chunks;
}