pdf_extractor_agent.js•7.49 kB
/**
* PDF Extractor Agent
* Extracts text and structured data from PDF documents
*/
const BaseAgent = require('../base_agent');
const fs = require('fs').promises;
const path = require('path');
const crypto = require('crypto');
class PDFExtractorAgent extends BaseAgent {
/**
* Constructor for the PDFExtractorAgent
* @param {Object} options - Configuration options
*/
constructor(options = {}) {
super({
name: 'PDFExtractorAgent',
description: 'Extracts text and structured data from PDF documents',
version: '1.0.0',
...options
});
this.tempDir = options.tempDir || 'temp';
this.pdfParser = options.pdfParser || null;
}
/**
* Process a PDF document
* @param {Object} input - Input containing PDF data
* @param {Object} context - Processing context
* @returns {Promise<Object>} - Extraction results
*/
async _process(input, context) {
this.logger.info(`[${this.name}] Processing PDF document`);
// Extract PDF data from input
const pdfData = await this.extractPDFData(input);
// Parse the PDF to extract text
const extractedText = await this.extractText(pdfData);
// Extract structured data (tables, forms, etc.)
const structuredData = await this.extractStructuredData(pdfData, extractedText);
// Extract metadata from the PDF
const metadata = await this.extractMetadata(pdfData);
// Analyze the content
const analysis = await this.analyzeContent(extractedText, structuredData, metadata);
return {
extracted_text: extractedText,
structured_data: structuredData,
metadata: metadata,
analysis: analysis
};
}
/**
* Extract PDF data from the input
* @param {Object} input - Input object
* @returns {Promise<Buffer>} - PDF data as a buffer
*/
async extractPDFData(input) {
// Handle different input formats
if (input.file && input.file.data) {
// Direct file data
return Buffer.from(input.file.data);
} else if (input.file && input.file.path) {
// File path
return await fs.readFile(input.file.path);
} else if (input.url) {
// URL to a PDF
throw new Error('URL-based PDF extraction not implemented yet');
} else if (input.text) {
// This is already extracted text, no need to process
return null;
} else {
throw new Error('Invalid input: No PDF data, file path, or URL provided');
}
}
/**
* Extract text from PDF data
* @param {Buffer} pdfData - PDF data as a buffer
* @returns {Promise<string>} - Extracted text
*/
async extractText(pdfData) {
if (!pdfData) {
return '';
}
// If we have a custom PDF parser, use it
if (this.pdfParser) {
return await this.pdfParser.extractText(pdfData);
}
// Otherwise, use a mock implementation
// In a real implementation, you would use a library like pdf.js or pdfparse
this.logger.warn(`[${this.name}] Using mock PDF text extraction`);
// Save the PDF to a temporary file for processing
const tempFilePath = await this.saveTempFile(pdfData, 'pdf');
// Mock extraction - in a real implementation, you would use a PDF parsing library
const mockText = `This is mock extracted text from a PDF document.
Document Title: Sample Document
Author: BlackHole System
Created: ${new Date().toISOString()}
Pages: 5
Section 1: Introduction
This document demonstrates the PDF extraction capabilities of the BlackHole system.
Section 2: Content
The system can extract text, tables, forms, and other structured data from PDF documents.
Section 3: Conclusion
The extracted data can be stored in MongoDB and queried by users.`;
// Clean up the temporary file
try {
await fs.unlink(tempFilePath);
} catch (error) {
this.logger.error(`[${this.name}] Error deleting temporary file: ${error.message}`);
}
return mockText;
}
/**
* Extract structured data from PDF
* @param {Buffer} pdfData - PDF data
* @param {string} extractedText - Already extracted text
* @returns {Promise<Object>} - Structured data
*/
async extractStructuredData(pdfData, extractedText) {
if (!pdfData) {
return {};
}
// Mock implementation - in a real implementation, you would use a PDF parsing library
// that can extract tables, forms, etc.
return {
tables: [
{
title: "Sample Table",
headers: ["ID", "Name", "Value"],
rows: [
["1", "Item 1", "100"],
["2", "Item 2", "200"],
["3", "Item 3", "300"]
]
}
],
forms: [
{
name: "Sample Form",
fields: [
{ name: "firstName", value: "John" },
{ name: "lastName", value: "Doe" },
{ name: "email", value: "john.doe@example.com" }
]
}
]
};
}
/**
* Extract metadata from PDF
* @param {Buffer} pdfData - PDF data
* @returns {Promise<Object>} - Metadata
*/
async extractMetadata(pdfData) {
if (!pdfData) {
return {};
}
// Mock implementation - in a real implementation, you would extract actual metadata
return {
title: "Sample Document",
author: "BlackHole System",
creator: "PDF Creator",
producer: "PDF Library",
creationDate: new Date().toISOString(),
modificationDate: new Date().toISOString(),
keywords: ["sample", "document", "pdf"],
pageCount: 5
};
}
/**
* Analyze the content of the PDF
* @param {string} text - Extracted text
* @param {Object} structuredData - Structured data
* @param {Object} metadata - PDF metadata
* @returns {Promise<Object>} - Analysis results
*/
async analyzeContent(text, structuredData, metadata) {
// Mock implementation - in a real implementation, you would use NLP or other analysis
return {
summary: "This document contains information about the BlackHole system's PDF extraction capabilities.",
language: "en",
wordCount: text.split(/\s+/).length,
entities: [
{ type: "ORGANIZATION", text: "BlackHole System", count: 1 },
{ type: "CONCEPT", text: "PDF extraction", count: 2 }
],
sentiment: {
score: 0.2,
magnitude: 0.5,
label: "neutral"
}
};
}
/**
* Save data to a temporary file
* @param {Buffer} data - Data to save
* @param {string} extension - File extension
* @returns {Promise<string>} - Path to the temporary file
*/
async saveTempFile(data, extension) {
// Create temp directory if it doesn't exist
try {
await fs.mkdir(this.tempDir, { recursive: true });
} catch (error) {
if (error.code !== 'EEXIST') {
throw error;
}
}
// Generate a unique filename
const filename = `${Date.now()}_${crypto.randomBytes(8).toString('hex')}.${extension}`;
const filePath = path.join(this.tempDir, filename);
// Write the data to the file
await fs.writeFile(filePath, data);
return filePath;
}
/**
* Validate the input
* @param {Object} input - Input to validate
*/
validateInput(input) {
super.validateInput(input);
if (!input.file && !input.url && !input.text) {
throw new Error('Input must contain file, url, or text');
}
}
}
module.exports = PDFExtractorAgent;