Skip to main content
Glama

MCP Orchestration Server

pdf_extractor_agent.js7.49 kB
/** * PDF Extractor Agent * Extracts text and structured data from PDF documents */ const BaseAgent = require('../base_agent'); const fs = require('fs').promises; const path = require('path'); const crypto = require('crypto'); class PDFExtractorAgent extends BaseAgent { /** * Constructor for the PDFExtractorAgent * @param {Object} options - Configuration options */ constructor(options = {}) { super({ name: 'PDFExtractorAgent', description: 'Extracts text and structured data from PDF documents', version: '1.0.0', ...options }); this.tempDir = options.tempDir || 'temp'; this.pdfParser = options.pdfParser || null; } /** * Process a PDF document * @param {Object} input - Input containing PDF data * @param {Object} context - Processing context * @returns {Promise<Object>} - Extraction results */ async _process(input, context) { this.logger.info(`[${this.name}] Processing PDF document`); // Extract PDF data from input const pdfData = await this.extractPDFData(input); // Parse the PDF to extract text const extractedText = await this.extractText(pdfData); // Extract structured data (tables, forms, etc.) const structuredData = await this.extractStructuredData(pdfData, extractedText); // Extract metadata from the PDF const metadata = await this.extractMetadata(pdfData); // Analyze the content const analysis = await this.analyzeContent(extractedText, structuredData, metadata); return { extracted_text: extractedText, structured_data: structuredData, metadata: metadata, analysis: analysis }; } /** * Extract PDF data from the input * @param {Object} input - Input object * @returns {Promise<Buffer>} - PDF data as a buffer */ async extractPDFData(input) { // Handle different input formats if (input.file && input.file.data) { // Direct file data return Buffer.from(input.file.data); } else if (input.file && input.file.path) { // File path return await fs.readFile(input.file.path); } else if (input.url) { // URL to a PDF throw new Error('URL-based PDF extraction not implemented yet'); } else if (input.text) { // This is already extracted text, no need to process return null; } else { throw new Error('Invalid input: No PDF data, file path, or URL provided'); } } /** * Extract text from PDF data * @param {Buffer} pdfData - PDF data as a buffer * @returns {Promise<string>} - Extracted text */ async extractText(pdfData) { if (!pdfData) { return ''; } // If we have a custom PDF parser, use it if (this.pdfParser) { return await this.pdfParser.extractText(pdfData); } // Otherwise, use a mock implementation // In a real implementation, you would use a library like pdf.js or pdfparse this.logger.warn(`[${this.name}] Using mock PDF text extraction`); // Save the PDF to a temporary file for processing const tempFilePath = await this.saveTempFile(pdfData, 'pdf'); // Mock extraction - in a real implementation, you would use a PDF parsing library const mockText = `This is mock extracted text from a PDF document. Document Title: Sample Document Author: BlackHole System Created: ${new Date().toISOString()} Pages: 5 Section 1: Introduction This document demonstrates the PDF extraction capabilities of the BlackHole system. Section 2: Content The system can extract text, tables, forms, and other structured data from PDF documents. Section 3: Conclusion The extracted data can be stored in MongoDB and queried by users.`; // Clean up the temporary file try { await fs.unlink(tempFilePath); } catch (error) { this.logger.error(`[${this.name}] Error deleting temporary file: ${error.message}`); } return mockText; } /** * Extract structured data from PDF * @param {Buffer} pdfData - PDF data * @param {string} extractedText - Already extracted text * @returns {Promise<Object>} - Structured data */ async extractStructuredData(pdfData, extractedText) { if (!pdfData) { return {}; } // Mock implementation - in a real implementation, you would use a PDF parsing library // that can extract tables, forms, etc. return { tables: [ { title: "Sample Table", headers: ["ID", "Name", "Value"], rows: [ ["1", "Item 1", "100"], ["2", "Item 2", "200"], ["3", "Item 3", "300"] ] } ], forms: [ { name: "Sample Form", fields: [ { name: "firstName", value: "John" }, { name: "lastName", value: "Doe" }, { name: "email", value: "john.doe@example.com" } ] } ] }; } /** * Extract metadata from PDF * @param {Buffer} pdfData - PDF data * @returns {Promise<Object>} - Metadata */ async extractMetadata(pdfData) { if (!pdfData) { return {}; } // Mock implementation - in a real implementation, you would extract actual metadata return { title: "Sample Document", author: "BlackHole System", creator: "PDF Creator", producer: "PDF Library", creationDate: new Date().toISOString(), modificationDate: new Date().toISOString(), keywords: ["sample", "document", "pdf"], pageCount: 5 }; } /** * Analyze the content of the PDF * @param {string} text - Extracted text * @param {Object} structuredData - Structured data * @param {Object} metadata - PDF metadata * @returns {Promise<Object>} - Analysis results */ async analyzeContent(text, structuredData, metadata) { // Mock implementation - in a real implementation, you would use NLP or other analysis return { summary: "This document contains information about the BlackHole system's PDF extraction capabilities.", language: "en", wordCount: text.split(/\s+/).length, entities: [ { type: "ORGANIZATION", text: "BlackHole System", count: 1 }, { type: "CONCEPT", text: "PDF extraction", count: 2 } ], sentiment: { score: 0.2, magnitude: 0.5, label: "neutral" } }; } /** * Save data to a temporary file * @param {Buffer} data - Data to save * @param {string} extension - File extension * @returns {Promise<string>} - Path to the temporary file */ async saveTempFile(data, extension) { // Create temp directory if it doesn't exist try { await fs.mkdir(this.tempDir, { recursive: true }); } catch (error) { if (error.code !== 'EEXIST') { throw error; } } // Generate a unique filename const filename = `${Date.now()}_${crypto.randomBytes(8).toString('hex')}.${extension}`; const filePath = path.join(this.tempDir, filename); // Write the data to the file await fs.writeFile(filePath, data); return filePath; } /** * Validate the input * @param {Object} input - Input to validate */ validateInput(input) { super.validateInput(input); if (!input.file && !input.url && !input.text) { throw new Error('Input must contain file, url, or text'); } } } module.exports = PDFExtractorAgent;

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Nisarg-123-web/MCP2'

If you have feedback or need assistance with the MCP directory API, please join our Discord server