import fs from "fs/promises";
import path from "path";
import pdf from "pdf-parse";
/**
 * Custom `pagerender` callback for pdf-parse that extracts text from a fixed
 * window of pages only (pages 2 to 6, where the parameterizations likely
 * start) and skips every other page.
 *
 * Text items are concatenated in the order they are reported; a newline is
 * inserted whenever an item's `transform[5]` value (treated here as the
 * item's vertical position) differs from that of the previous item.
 *
 * @param {object} pageData - Page object handed over by pdf-parse. This
 *   function reads `pageData.pageNumber` (1-based) and calls
 *   `pageData.getTextContent(options)`.
 * @returns {string|Promise<string>} The empty string for pages outside the
 *   window; otherwise a promise resolving to the page text prefixed with a
 *   `--- Page N ---` header line.
 */
function render_page(pageData) {
  const FIRST_PAGE = 2;
  const LAST_PAGE = 6;

  const pageNum = pageData.pageNumber;
  if (pageNum < FIRST_PAGE || pageNum > LAST_PAGE) {
    return "";
  }

  // Default render options from the pdf-parse documentation.
  const renderOptions = {
    normalizeWhitespace: false,
    disableCombineTextItems: false,
  };

  return pageData.getTextContent(renderOptions).then((textContent) => {
    let lastY;
    let text = "";
    for (const item of textContent.items) {
      const y = item.transform[5];
      // Compare against `undefined` explicitly: a previous Y of exactly 0 is
      // a real coordinate and must not be mistaken for "no previous item".
      if (lastY === undefined || lastY === y) {
        text += item.str;
      } else {
        text += `\n${item.str}`;
      }
      lastY = y;
    }
    return `\n--- Page ${pageNum} ---\n${text}`;
  });
}
/**
 * Script entry point: reads the PDF at `./pdg/rpp2023-rev-resonances.pdf`
 * (resolved against the current working directory), runs it through
 * pdf-parse with `render_page` as the per-page renderer, and prints the
 * extracted text and its length to stdout.
 *
 * Errors from reading or parsing are logged to stderr and not rethrown; the
 * process exit code is set to 1 so callers can detect the failure.
 *
 * @returns {Promise<void>} Resolves once the text has been printed or the
 *   error has been logged.
 */
async function main() {
  const filePath = path.resolve("./pdg/rpp2023-rev-resonances.pdf");
  console.log(`Reading file: ${filePath}`);

  try {
    const dataBuffer = await fs.readFile(filePath);
    // Use the custom per-page renderer instead of pdf-parse's default.
    const { text } = await pdf(dataBuffer, { pagerender: render_page });
    console.log(`Extracted Text Length: ${text.length}`);
    console.log(text);
  } catch (error) {
    console.error("Error:", error);
    // Signal failure without cutting off pending stdout/stderr writes.
    process.exitCode = 1;
  }
}
// Start the script. main() handles its own errors, so the returned promise is
// intentionally left un-awaited.
void main();