/**
 * PDF content processor
 * Handles PDF text extraction and conversion to markdown
 */
import pdf from 'pdf-parse';
import { ProcessedContent, ContentMetadata, DocumentStructure, ContentChunk, Heading, TableInfo } from '../types/content.js';

export interface PDFProcessorOptions {
  /** Target chunk size in characters (default: 4000) */
  chunkSize?: number;
  /** Insert page-break markers into the generated markdown */
  preservePageBreaks?: boolean;
  /** Run heuristic heading detection on the extracted text */
  extractHeadings?: boolean;
  /** Restrict processing to an inclusive, 1-based page range */
  pageRange?: { start: number; end: number };
}

export interface PDFPageInfo {
  pageNumber: number;
  text: string;
  startPosition: number;
  endPosition: number;
}
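
// NOTE: pdf-parse exposes the document text as one concatenated string, so
// the page segments above are approximated by splitting that string into
// equal-length pieces (see extractPages). Treat page boundaries as estimates.
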
export class PDFProcessor {
  /**
   * Process PDF file and convert to markdown
   */
  async processPDF(
    buffer: Buffer,
    options: PDFProcessorOptions = {}
  ): Promise<ProcessedContent> {
    try {
      // Parse PDF using pdf-parse
      const pdfData = await pdf(buffer);

      // Clean and normalize the extracted text up front so that page
      // boundaries, headings and chunk positions all refer to the same string
      const cleanedFull = this.cleanText(pdfData.text);

      // Derive (approximate) per-page segments from the cleaned text
      const pages = this.extractPages(cleanedFull, pdfData.numpages, options.pageRange);

      // Apply page range filter if specified
      const cleanText = options.pageRange
        ? this.filterByPageRange(pages, options.pageRange)
        : cleanedFull;

      // Extract headings if requested
      const headings = options.extractHeadings ? this.extractHeadings(cleanText, pages) : [];

      // Create document structure
      const structure = this.analyzeStructure(cleanText, headings, pages, options.preservePageBreaks);

      // Generate chunks
      const chunks = this.createChunks(cleanText, pages, options.chunkSize || 4000);

      // Create metadata
      const metadata: ContentMetadata = {
        wordCount: this.countWords(cleanText),
        pageCount: pdfData.numpages,
        language: this.detectLanguage(cleanText),
        headings,
        images: [], // PDF image extraction would require additional processing
        tables: this.detectTables(cleanText),
        lastProcessed: new Date()
      };

      // Convert to markdown format
      const markdown = this.convertToMarkdown(cleanText, pages, headings, options);

      return {
        markdown,
        metadata,
        structure,
        chunks
      };
    } catch (error) {
      throw new Error(`Failed to process PDF: ${error instanceof Error ? error.message : 'Unknown error'}`);
    }
  }

  /**
   * Derive per-page segments with position information.
   * pdf-parse only returns the concatenated text, so pages are approximated
   * by splitting the text into equal-length segments.
   */
  private extractPages(fullText: string, numPages: number, pageRange?: { start: number; end: number }): PDFPageInfo[] {
    const pages: PDFPageInfo[] = [];
    const estimatedPageLength = Math.ceil(fullText.length / Math.max(numPages, 1));

    for (let i = 0; i < numPages; i++) {
      const pageNumber = i + 1;

      // Skip pages outside range if specified
      if (pageRange && (pageNumber < pageRange.start || pageNumber > pageRange.end)) {
        continue;
      }

      const startPosition = i * estimatedPageLength;
      const endPosition = Math.min((i + 1) * estimatedPageLength, fullText.length);

      pages.push({
        pageNumber,
        text: fullText.slice(startPosition, endPosition),
        startPosition,
        endPosition
      });
    }

    return pages;
  }

  /**
   * Filter text by page range
   */
  private filterByPageRange(pages: PDFPageInfo[], pageRange: { start: number; end: number }): string {
    return pages
      .filter(page => page.pageNumber >= pageRange.start && page.pageNumber <= pageRange.end)
      .map(page => page.text)
      .join('\n\n');
  }

  /**
   * Clean and normalize PDF text
   */
  private cleanText(text: string): string {
    return text
      // Normalize line endings first so the rules below can rely on '\n'
      .replace(/\r\n/g, '\n')
      .replace(/\r/g, '\n')
      // Expand tabs so column gaps survive as spaces
      .replace(/\t/g, '    ')
      // Collapse runs of horizontal whitespace but keep line breaks intact
      // (heading and table detection depend on them); wide gaps are kept at
      // three spaces so column boundaries stay visible to detectTables
      .replace(/[^\S\n]+/g, match => (match.length >= 3 ? '   ' : ' '))
      // Fix common PDF extraction issues (letter-only classes here, since
      // \w also matches digits and would insert spaces inside numbers)
      .replace(/([a-z])([A-Z])/g, '$1 $2')   // Split joined words ("endStart" -> "end Start")
      .replace(/([a-zA-Z])(\d)/g, '$1 $2')   // Add space between letter and digit
      .replace(/(\d)([a-zA-Z])/g, '$1 $2')   // Add space between digit and letter
      // Remove excessive line breaks
      .replace(/\n{3,}/g, '\n\n')
      // Trim
      .trim();
  }

  /**
   * Extract headings from PDF text using layout heuristics
   */
  private extractHeadings(text: string, _pages: PDFPageInfo[]): Heading[] {
    const headings: Heading[] = [];
    const lines = text.split('\n');
    let offset = 0; // running character position of the current line in `text`

    lines.forEach((line, index) => {
      const lineStart = offset;
      offset += line.length + 1; // +1 for the '\n' consumed by split

      const trimmed = line.trim();

      // Skip empty lines, very short lines and overly long lines
      if (trimmed.length < 3 || trimmed.length > 100) return;

      // Check for common heading patterns in PDFs
      const patterns = [
        // All caps (likely heading)
        /^[A-Z\s\d\.\-]{3,50}$/,
        // Numbered sections
        /^\d+\.?\s+[A-Z][a-zA-Z\s]+$/,
        // Chapter/Section indicators
        /^(Chapter|Section|Part)\s+\d+/i,
        // Title-case lines of moderate length (heuristic)
        /^[A-Z][a-zA-Z\s]{5,50}$/
      ];

      let isHeading = false;
      let level = 3; // Default level

      for (const pattern of patterns) {
        if (pattern.test(trimmed)) {
          isHeading = true;
          // Determine level based on pattern
          if (trimmed.match(/^(Chapter|Part)/i)) level = 1;
          else if (trimmed.match(/^Section/i)) level = 2;
          else if (trimmed.match(/^\d+\./)) level = 2;
          else if (trimmed === trimmed.toUpperCase()) level = 2;
          break;
        }
      }

      // Additional heuristic: a real heading should be followed by content
      if (isHeading && index < lines.length - 1) {
        const nextLine = lines[index + 1].trim();
        if (nextLine.length > 20) {
          headings.push({
            level,
            text: trimmed,
            // Position tracked incrementally, so repeated heading text does
            // not resolve every occurrence to the first match in the document
            position: lineStart + line.indexOf(trimmed)
          });
        }
      }
    });

    return headings;
  }

  /**
   * Detect table-like structures in text
   */
  private detectTables(text: string): TableInfo[] {
    const tables: TableInfo[] = [];
    const lines = text.split('\n');
    let inTable = false;
    let tableStart = -1;
    let currentTable: string[] = [];
    let offset = 0; // running character position of the current line in `text`

    // Record the current run of rows as a table if it is at least two rows tall
    const flush = () => {
      if (currentTable.length >= 2) {
        const columns = Math.max(...currentTable.map(row => row.split(/\s{2,}/).length));
        tables.push({
          rows: currentTable.length,
          columns,
          position: tableStart
        });
      }
      inTable = false;
      currentTable = [];
    };

    for (const line of lines) {
      const lineStart = offset;
      offset += line.length + 1; // +1 for the '\n' consumed by split

      const trimmed = line.trim();

      // Simple heuristic: wide gaps or number-word-number runs suggest table rows
      const hasMultipleSpaces = /\s{3,}/.test(trimmed);
      const hasTabularData = /\d+\s+\w+\s+\d+/.test(trimmed);

      if ((hasMultipleSpaces || hasTabularData) && trimmed.length > 20) {
        if (!inTable) {
          inTable = true;
          tableStart = lineStart;
        }
        currentTable.push(trimmed);
      } else if (inTable) {
        flush();
      }
    }

    // Flush a table that runs to the very end of the text
    if (inTable) flush();

    return tables;
  }

  /**
   * Analyze document structure
   */
  private analyzeStructure(
    text: string,
    headings: Heading[],
    pages: PDFPageInfo[],
    preservePageBreaks?: boolean
  ): DocumentStructure {
    const sections = headings.map((heading, index) => {
      const nextHeading = headings[index + 1];
      const endPosition = nextHeading ? nextHeading.position : text.length;
      return {
        title: heading.text,
        level: heading.level,
        startPosition: heading.position,
        endPosition,
        content: text.slice(heading.position, endPosition).trim()
      };
    });

    const toc = headings.map(heading => {
      // Try to determine page number for heading
      const page = pages.find(p =>
        heading.position >= p.startPosition && heading.position <= p.endPosition
      );
      return {
        title: heading.text,
        level: heading.level,
        page: page?.pageNumber || 1,
        position: heading.position
      };
    });

    // Page breaks are the boundaries between pages
    const pageBreaks = preservePageBreaks ?
      pages.map(page => page.endPosition).slice(0, -1) : [];

    return {
      sections,
      toc,
      pageBreaks
    };
  }

  /**
   * Create content chunks with page tracking
   */
  private createChunks(text: string, pages: PDFPageInfo[], chunkSize: number): ContentChunk[] {
    const chunks: ContentChunk[] = [];
    const words = text.split(/\s+/).filter(word => word.length > 0);
    let currentChunk = '';
    let chunkIndex = 0;
    let searchPosition = 0; // pointer into `text` used to locate each word
    let chunkStart = 0;     // position of the current chunk's first word
    let chunkEnd = 0;       // position just past the current chunk's last word

    const pushChunk = () => {
      const content = currentChunk.trim();
      if (!content) return;
      const startPage = pages.find(p => chunkStart >= p.startPosition && chunkStart <= p.endPosition);
      const endPage = pages.find(p => chunkEnd >= p.startPosition && chunkEnd <= p.endPosition);
      chunks.push({
        id: `chunk_${chunkIndex}`,
        content,
        startPage: startPage?.pageNumber || 1,
        endPage: endPage?.pageNumber || startPage?.pageNumber || 1,
        metadata: {
          wordCount: content.split(/\s+/).length,
          position: { start: chunkStart, end: chunkEnd }
        }
      });
      chunkIndex++;
    };

    for (const word of words) {
      // Locate the word in the source text so recorded positions stay
      // accurate even though the chunk string joins words with single spaces
      const wordStart = text.indexOf(word, searchPosition);
      const wordEnd = wordStart >= 0 ? wordStart + word.length : searchPosition;
      if (wordStart >= 0) searchPosition = wordEnd;

      const testChunk = currentChunk + (currentChunk ? ' ' : '') + word;
      if (testChunk.length > chunkSize && currentChunk.length > 0) {
        pushChunk();
        // Start a new chunk with the current word
        currentChunk = word;
        chunkStart = wordStart >= 0 ? wordStart : searchPosition;
      } else {
        if (currentChunk.length === 0) {
          chunkStart = wordStart >= 0 ? wordStart : searchPosition;
        }
        currentChunk = testChunk;
      }
      chunkEnd = wordEnd;
    }

    // Add the final chunk
    pushChunk();

    return chunks;
  }

  /**
   * Convert PDF text to markdown format
   */
  private convertToMarkdown(
    text: string,
    pages: PDFPageInfo[],
    headings: Heading[],
    options: PDFProcessorOptions
  ): string {
    // Collect all edits up front and apply them back to front, so earlier
    // insertions cannot shift the positions of later ones
    const edits: { start: number; end: number; replacement: string }[] = [];

    // Convert detected headings to markdown format
    headings.forEach(heading => {
      const markdownHeading = '#'.repeat(Math.min(heading.level, 6)) + ' ' + heading.text;
      edits.push({
        start: heading.position,
        end: heading.position + heading.text.length,
        replacement: markdownHeading
      });
    });

    // Add page breaks if requested
    if (options.preservePageBreaks && pages.length > 1) {
      pages.slice(1).forEach(page => {
        edits.push({
          start: page.startPosition,
          end: page.startPosition,
          replacement: `\n\n---\n*Page ${page.pageNumber}*\n\n`
        });
      });
    }

    let markdown = text;
    edits
      .sort((a, b) => b.start - a.start)
      .forEach(edit => {
        markdown = markdown.slice(0, edit.start) + edit.replacement + markdown.slice(edit.end);
      });

    return markdown;
  }

  /**
   * Count words in text
   */
  private countWords(text: string): number {
    return text.trim().split(/\s+/).filter(word => word.length > 0).length;
  }

  /**
   * Very rough language detection: counts common English stop words in a
   * sample and falls back to 'unknown'
   */
  private detectLanguage(text: string): string {
    const sample = text.slice(0, 1000).toLowerCase();
    const englishWords = ['the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by'];

    const englishCount = englishWords.reduce((count, word) => {
      return count + (sample.match(new RegExp(`\\b${word}\\b`, 'g')) || []).length;
    }, 0);

    return englishCount > 5 ? 'en' : 'unknown';
  }
}
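
/*
 * Minimal usage sketch (illustrative; the file path and option values are
 * examples, not project conventions):
 *
 *   import { readFile } from 'fs/promises';
 *
 *   const buffer = await readFile('./docs/report.pdf');
 *   const processor = new PDFProcessor();
 *   const result = await processor.processPDF(buffer, {
 *     chunkSize: 2000,
 *     extractHeadings: true,
 *     preservePageBreaks: true
 *   });
 *   console.log(`${result.metadata.pageCount} pages, ${result.chunks.length} chunks`);
 */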