import { readFileSync, writeFileSync } from "node:fs";
import pdf from "pdf-parse";
/**
 * Summary of a PDF's metadata as assembled by `extractText` in this module.
 *
 * The string fields are copied from the parser's `info` object; `extractText`
 * substitutes an empty string whenever an entry is missing or falsy, so these
 * fields are never `undefined`.
 */
export interface PdfInfo {
/** Page count reported by the parser (`numpages`). */
pages: number;
/** The `Title` entry of the document info, or `""` when absent. */
title: string;
/** The `Author` entry of the document info, or `""` when absent. */
author: string;
/** The `Subject` entry of the document info, or `""` when absent. */
subject: string;
/** The `Creator` entry of the document info, or `""` when absent. */
creator: string;
/** The `Producer` entry of the document info, or `""` when absent. */
producer: string;
/** The `CreationDate` entry passed through as-is (no date parsing is done here), or `""`. */
creationDate: string;
/** The `ModDate` entry passed through as-is (no date parsing is done here), or `""`. */
modDate: string;
/** Length of the extracted text, i.e. `text.length` (UTF-16 code units). */
textLength: number;
}
/** Result of `extractText`: the extracted text plus page count and metadata. */
export interface PdfTextResult {
/** Full text of the document as returned by the parser. */
text: string;
/** Page count reported by the parser; the same value is stored in `info.pages`. */
pages: number;
/** Metadata summary for the document. */
info: PdfInfo;
}
/**
 * Reads a PDF from disk and extracts its text together with its metadata.
 *
 * The file is read synchronously and the resulting buffer is handed to
 * `pdf-parse`. Metadata entries that are missing (or otherwise falsy) are
 * normalized to empty strings so every field of the result is defined.
 *
 * @param filePath - Path of the PDF file to read.
 * @returns The extracted text, the page count and a metadata summary.
 *   The returned promise rejects if reading the file or parsing it throws;
 *   no error is caught here.
 */
export async function extractText(filePath: string): Promise<PdfTextResult> {
  const parsed = await pdf(readFileSync(filePath));
  const { text, numpages: pages } = parsed;
  const meta = parsed.info;

  const info = {
    pages,
    title: meta?.Title || "",
    author: meta?.Author || "",
    subject: meta?.Subject || "",
    creator: meta?.Creator || "",
    producer: meta?.Producer || "",
    creationDate: meta?.CreationDate || "",
    modDate: meta?.ModDate || "",
    textLength: text.length,
  };

  return { text, pages, info };
}
/**
 * Extracts the text of an inclusive, 1-based page range from a PDF.
 *
 * Page boundaries are only approximated: the full text is split on form feeds
 * or on runs of three or more newlines, so the "pages" seen here may not line
 * up with the document's real pages.
 *
 * Out-of-range arguments are clamped rather than rejected: a `startPage` below
 * 1 starts at the first chunk, an `endPage` beyond the last chunk stops at the
 * last chunk, and an `endPage` that lies before `startPage` (including zero or
 * negative values) yields an empty string.
 *
 * @param filePath - Path of the PDF file to read.
 * @param startPage - First page to include (1-based).
 * @param endPage - Last page to include (1-based, inclusive).
 * @returns The selected chunks joined by a `---` separator line.
 */
export async function extractPageRange(
  filePath: string,
  startPage: number,
  endPage: number,
): Promise<string> {
  const result = await extractText(filePath);
  // pdf-parse doesn't give per-page text easily, so we approximate
  // by splitting on form feeds or runs of blank lines.
  const pages = result.text.split(/\f|\n{3,}/);
  const start = Math.max(0, startPage - 1);
  // Never let the end fall below the start: slice() interprets a negative end
  // as an offset from the END of the array, so an unclamped endPage of -1
  // would return almost the whole document instead of nothing.
  const end = Math.max(start, Math.min(pages.length, endPage));
  return pages.slice(start, end).join("\n\n---\n\n");
}
/**
 * Returns only the metadata summary of a PDF.
 *
 * This runs the full `extractText` pipeline (the whole document is read and
 * parsed) and discards everything except the `info` part of its result.
 *
 * @param filePath - Path of the PDF file to read.
 * @returns The metadata summary produced by `extractText`.
 */
export async function getPdfInfo(filePath: string): Promise<PdfInfo> {
  const { info } = await extractText(filePath);
  return info;
}
/**
 * Finds every line of `text` that contains `query`, ignoring case.
 *
 * Matching is a plain substring test on the untrimmed line, so leading or
 * trailing whitespace in `query` is significant. Each hit reports the 1-based
 * line number, the trimmed line, and a context made of the previous, matching
 * and next lines (trimmed, with empty lines dropped) joined by newlines.
 *
 * @param text - Text to search, with lines separated by `\n`.
 * @param query - Substring to look for. An empty query yields no results,
 *   because the empty string is a substring of every line.
 * @returns One entry per matching line, in document order.
 */
export function searchInPdf(text: string, query: string): SearchResult[] {
  if (query === "") {
    return [];
  }

  const needle = query.toLowerCase();
  const lines = text.split("\n");
  const results: SearchResult[] = [];

  lines.forEach((rawLine, index) => {
    if (!rawLine.toLowerCase().includes(needle)) {
      return;
    }
    const current = rawLine.trim();
    // Neighbours do not exist at the first/last line; those (and blank
    // neighbours) are removed by the filter below.
    const previous = lines[index - 1]?.trim() || "";
    const next = lines[index + 1]?.trim() || "";
    results.push({
      line: index + 1,
      text: current,
      context: [previous, current, next].filter(Boolean).join("\n"),
    });
  });

  return results;
}
/** A single hit produced by `searchInPdf`. */
export interface SearchResult {
/** 1-based number of the matching line within the searched text. */
line: number;
/** The matching line with surrounding whitespace trimmed. */
text: string;
/** Previous, matching and next lines (trimmed, empty ones dropped) joined by `\n`. */
context: string;
}
/**
 * Counts the whitespace-separated words in `text`.
 *
 * A word is any maximal run of non-whitespace characters; punctuation is not
 * treated specially.
 *
 * @param text - Text to count words in.
 * @returns The number of words, or 0 for empty or whitespace-only text.
 */
export function getWordCount(text: string): number {
  const tokens = text.match(/\S+/g);
  return tokens === null ? 0 : tokens.length;
}
/**
 * Returns the most frequent words in `text`, most frequent first.
 *
 * The text is lower-cased and every character other than `a-z`, `0-9` and
 * whitespace is removed (so punctuation and non-ASCII letters are dropped
 * before counting). Words of one or two characters are ignored.
 *
 * Counts are kept in a `Map` rather than a plain object so that words which
 * happen to name inherited object properties (e.g. "constructor") are counted
 * like any other word. Words with equal counts keep first-occurrence order.
 *
 * @param text - Text to analyse.
 * @param topN - Maximum number of entries to return; values below zero are
 *   treated as zero. Defaults to 20.
 * @returns Up to `topN` `{ word, count }` pairs sorted by descending count.
 */
export function getWordFrequency(
  text: string,
  topN: number = 20,
): { word: string; count: number }[] {
  const words = text
    .toLowerCase()
    .replace(/[^a-z0-9\s]/g, "")
    .split(/\s+/)
    .filter((w) => w.length > 2);

  const freq = new Map<string, number>();
  for (const word of words) {
    freq.set(word, (freq.get(word) ?? 0) + 1);
  }

  // A negative limit would make slice() drop entries from the end instead of
  // returning nothing, so clamp it at zero.
  const limit = Math.max(0, topN);

  return Array.from(freq.entries())
    .sort((a, b) => b[1] - a[1])
    .slice(0, limit)
    .map(([word, count]) => ({ word, count }));
}