import fs from "fs/promises";
import path from "path";
import pdf from "pdf-parse/lib/pdf-parse.js"; // Try direct import if main export fails, or just 'pdf-parse'
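// The default export of pdf-parse is normally the parse function, but importing
// the package root from an ES module can be unreliable. The standard import
// below is tried first; the direct lib import above is kept as a fallback.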
import pdfParse from "pdf-parse";
/**
 * Custom page renderer for pdf-parse that keeps only pages 7 through 15.
 *
 * Text items are concatenated in the order they are returned. A newline is
 * inserted whenever an item's `transform[5]` value (tracked here as the item's
 * Y position) differs from that of the previous item.
 *
 * @param {object} pageData - Page object exposing `pageNumber` and
 *   `getTextContent(options)`.
 * @returns {string|Promise<string>} An empty string for pages outside the
 *   range; otherwise a promise resolving to the page text prefixed with a
 *   `--- Page N ---` header.
 */
function render_page(pageData) {
  const firstPage = 7;
  const lastPage = 15;
  const pageNum = pageData.pageNumber;

  if (pageNum < firstPage || pageNum > lastPage) {
    return "";
  }

  const renderOptions = {
    normalizeWhitespace: false,
    disableCombineTextItems: false,
  };

  return pageData.getTextContent(renderOptions).then((textContent) => {
    let lastY;
    let text = "";

    for (const item of textContent.items) {
      const y = item.transform[5];
      // Test for "no previous item" with an explicit undefined check: a
      // truthiness test would also match a previous Y of 0 and swallow the
      // line break that should follow it.
      if (lastY === undefined || lastY === y) {
        text += item.str;
      } else {
        text += `\n${item.str}`;
      }
      lastY = y;
    }

    return `\n--- Page ${pageNum} ---\n${text}`;
  });
}
/**
 * Reads ./pdg/rpp2023-rev-resonances.pdf (resolved against the current working
 * directory), parses it with pdf-parse using `render_page` as the page
 * renderer, and prints the extracted text to stdout.
 *
 * Failures are logged to stderr and reported through a non-zero exit code
 * instead of being rethrown.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const filePath = path.resolve("./pdg/rpp2023-rev-resonances.pdf");

  try {
    const dataBuffer = await fs.readFile(filePath);
    const { text } = await pdfParse(dataBuffer, { pagerender: render_page });
    console.log(text);
  } catch (error) {
    console.error("Error:", error);
    // Signal failure to the invoking shell without cutting off pending output.
    process.exitCode = 1;
  }
}
// Script entry point: start the extraction. main() catches and logs its own errors.
main();