Skip to main content
Glama

PDFtotext MCP Server

by jpwebb
server.jsโ€ข8.42 kB
#!/usr/bin/env node

/**
 * PDFtotext MCP Server
 *
 * A reliable Model Context Protocol server for PDF text extraction using pdftotext.
 *
 * @author Jason Webb
 * @license MIT
 */

const { Server } = require('@modelcontextprotocol/sdk/server/index.js');
const { StdioServerTransport } = require('@modelcontextprotocol/sdk/server/stdio.js');
const { CallToolRequestSchema, ListToolsRequestSchema } = require('@modelcontextprotocol/sdk/types.js');
const { execSync, execFileSync, spawn } = require('child_process');
const fs = require('fs');
const path = require('path');

/** Leading bytes every PDF file starts with. */
const PDF_MAGIC = '%PDF';

/**
 * Node Buffer encodings used to decode pdftotext output for a given `-enc`
 * value. Anything not listed here is decoded as UTF-8.
 */
const OUTPUT_DECODERS = Object.freeze({
  Latin1: 'latin1',
});

/**
 * Build an Error that carries the `errorType` to report to the client, so
 * classification does not depend on substrings of the message (which embeds
 * the caller-supplied path).
 *
 * @param {string} message - Human-readable error message.
 * @param {string} errorType - Value for the `errorType` field of the error response.
 * @returns {Error} The tagged error.
 */
function createTypedError(message, errorType) {
  const error = new Error(message);
  error.errorType = errorType;
  return error;
}

/**
 * Read at most `length` bytes from the start of a file without loading the
 * rest of it. The descriptor is always closed, even when the read fails.
 *
 * @param {string} filePath - File to read from.
 * @param {number} length - Maximum number of bytes to read.
 * @returns {Buffer} The bytes actually read (may be shorter than `length`).
 * @throws {Error} Any error raised by `fs.openSync` / `fs.readSync`.
 */
function readLeadingBytes(filePath, length) {
  const fd = fs.openSync(filePath, 'r');
  try {
    const buffer = Buffer.alloc(length);
    const bytesRead = fs.readSync(fd, buffer, 0, length, 0);
    return buffer.subarray(0, bytesRead);
  } finally {
    fs.closeSync(fd);
  }
}

/**
 * Map a thrown value to the `errorType` string placed in the error response.
 * An explicit `errorType` tag wins; otherwise the system error code is used,
 * and only then the message-substring heuristics.
 *
 * @param {*} error - The caught value.
 * @returns {string} One of the error type identifiers.
 */
function classifyError(error) {
  if (error && error.errorType) {
    return error.errorType;
  }
  const code = error && error.code;
  if (code === 'ENOENT') {
    return 'FILE_NOT_FOUND';
  }
  if (code === 'EACCES') {
    return 'PERMISSION_DENIED';
  }
  const message = String(error && error.message);
  if (message.includes('pdftotext')) {
    return 'PDFTOTEXT_ERROR';
  }
  if (message.includes('PDF')) {
    return 'INVALID_PDF';
  }
  return 'UNKNOWN_ERROR';
}

/**
 * PDFtotext MCP Server Class
 *
 * Provides PDF text extraction capabilities through the Model Context Protocol
 * using the reliable pdftotext utility from poppler-utils.
 */
class PDFtotextServer {
  constructor() {
    this.server = new Server(
      {
        name: 'pdftotext-mcp',
        version: '1.0.0',
        description: 'A reliable MCP server for PDF text extraction using pdftotext',
      },
      {
        capabilities: {
          tools: {},
        },
      }
    );

    this.setupToolHandlers();
    this.setupErrorHandling();
  }

  /**
   * Set up error handling and graceful shutdown
   */
  setupErrorHandling() {
    this.server.onerror = (error) => {
      console.error('[PDFtotext MCP Error]', error);
    };

    // One shared handler for both signals. A failing close() is logged and
    // turned into a non-zero exit instead of an unhandled rejection.
    const shutdown = async () => {
      console.error('Shutting down PDFtotext MCP server...');
      try {
        await this.server.close();
      } catch (error) {
        console.error('[PDFtotext MCP Error]', error);
        process.exit(1);
      }
      process.exit(0);
    };

    process.on('SIGINT', shutdown);
    process.on('SIGTERM', shutdown);
  }

  /**
   * Set up MCP tool handlers
   */
  setupToolHandlers() {
    // List available tools
    this.server.setRequestHandler(ListToolsRequestSchema, async () => ({
      tools: [
        {
          name: 'read_pdf_text',
          description: 'Extract text content from a PDF file using pdftotext from poppler-utils',
          inputSchema: {
            type: 'object',
            properties: {
              path: {
                type: 'string',
                description: 'Path to the PDF file (relative to current working directory or absolute path)',
              },
              page: {
                type: 'number',
                description: 'Specific page number to extract (1-based indexing). If not specified, extracts all pages.',
                minimum: 1,
              },
              layout: {
                type: 'boolean',
                description: 'Preserve original text layout formatting (default: false)',
                default: false,
              },
              encoding: {
                type: 'string',
                description: 'Text encoding for output (default: UTF-8)',
                default: 'UTF-8',
                enum: ['UTF-8', 'Latin1', 'ASCII'],
              },
            },
            required: ['path'],
          },
        },
      ],
    }));

    // Handle tool calls
    this.server.setRequestHandler(CallToolRequestSchema, async (request) => {
      if (request.params.name !== 'read_pdf_text') {
        throw new Error(`Unknown tool: ${request.params.name}`);
      }

      return this.handleReadPdfText(request.params.arguments);
    });
  }

  /**
   * Check if pdftotext is available on the system
   *
   * @returns {boolean} `true` when `pdftotext -v` runs and exits successfully.
   */
  checkPdftotextAvailable() {
    try {
      execSync('pdftotext -v', { stdio: 'pipe' });
      return true;
    } catch (error) {
      return false;
    }
  }

  /**
   * Validate PDF file path and accessibility
   *
   * @param {string} filePath - Path to the candidate PDF file.
   * @throws {Error} When the file is missing, unreadable, lacks a `.pdf`
   *   extension, or does not start with the `%PDF` header. Each error carries
   *   an `errorType` property naming the failure category.
   */
  validatePdfFile(filePath) {
    // Check if file exists (a missing or non-string path counts as not found)
    if (typeof filePath !== 'string' || filePath === '' || !fs.existsSync(filePath)) {
      throw createTypedError(`File not found: ${filePath}`, 'FILE_NOT_FOUND');
    }

    // Check if file is readable
    try {
      fs.accessSync(filePath, fs.constants.R_OK);
    } catch (error) {
      throw createTypedError(`File is not readable: ${filePath}`, 'PERMISSION_DENIED');
    }

    // Basic PDF file validation (check extension and magic bytes)
    if (!filePath.toLowerCase().endsWith('.pdf')) {
      throw createTypedError(`File does not appear to be a PDF: ${filePath}`, 'INVALID_PDF');
    }

    // Only the first bytes are read: fs.readFileSync has no start/end options,
    // so it would load the entire file just to inspect the header.
    let header;
    try {
      header = readLeadingBytes(filePath, PDF_MAGIC.length);
    } catch (error) {
      throw createTypedError(`Unable to validate PDF file: ${filePath}`, 'INVALID_PDF');
    }

    if (header.toString('latin1') !== PDF_MAGIC) {
      throw createTypedError(
        `File is not a valid PDF (missing PDF header): ${filePath}`,
        'INVALID_PDF'
      );
    }
  }

  /**
   * Handle PDF text extraction
   *
   * Never throws: every failure is converted into a JSON error payload.
   *
   * @param {object} args - Tool arguments.
   * @param {string} args.path - Path to the PDF file.
   * @param {number} [args.page] - 1-based page to extract; a falsy value
   *   (undefined, null, 0) extracts all pages.
   * @param {boolean} [args.layout=false] - Pass `-layout` to pdftotext.
   * @param {string} [args.encoding='UTF-8'] - Value for pdftotext's `-enc` option.
   * @returns {Promise<{content: Array<{type: string, text: string}>}>} MCP tool
   *   result whose text is a pretty-printed JSON success or error object.
   */
  async handleReadPdfText(args) {
    try {
      // Tolerate a missing arguments object; validation reports the missing path.
      const { path: filePath, page, layout = false, encoding = 'UTF-8' } = args || {};

      // Check if pdftotext is available
      if (!this.checkPdftotextAvailable()) {
        throw createTypedError(
          'pdftotext is not available. Please install poppler-utils:\n' +
            ' Ubuntu/Debian: sudo apt install poppler-utils\n' +
            ' macOS: brew install poppler\n' +
            ' Windows: choco install poppler',
          'PDFTOTEXT_ERROR'
        );
      }

      // Validate the PDF file
      this.validatePdfFile(filePath);

      const resolvedPath = path.resolve(filePath);

      // Build the pdftotext argument vector. The values are handed to the
      // process directly (no shell), so spaces, quotes and shell
      // metacharacters in the caller-supplied values are passed literally.
      const commandArgs = [];

      // Add encoding if specified
      if (encoding !== 'UTF-8') {
        commandArgs.push('-enc', String(encoding));
      }

      // Add layout preservation if requested
      if (layout) {
        commandArgs.push('-layout');
      }

      // Add page specification if provided
      if (page) {
        const pageNumber = Number(page);
        if (!Number.isInteger(pageNumber) || pageNumber < 1) {
          throw createTypedError(`Invalid page number: ${page}`, 'INVALID_ARGUMENT');
        }
        commandArgs.push('-f', String(pageNumber), '-l', String(pageNumber));
      }

      // Add input file and output to stdout. The absolute path cannot start
      // with '-', so it can never be mistaken for an option.
      commandArgs.push(resolvedPath, '-');

      // Execute pdftotext
      const output = execFileSync('pdftotext', commandArgs, {
        maxBuffer: 50 * 1024 * 1024, // 50MB buffer for very large PDFs
        timeout: 30000, // 30 second timeout
      });

      // Decode with the charset pdftotext was asked to emit; decoding Latin1
      // bytes as UTF-8 would corrupt every non-ASCII character.
      const text = output.toString(OUTPUT_DECODERS[encoding] || 'utf8').trim();

      // Get file metadata
      const stats = fs.statSync(filePath);

      // Prepare response
      const response = {
        success: true,
        file: path.basename(filePath),
        path: resolvedPath,
        directory: path.dirname(resolvedPath),
        extractedText: text,
        pageSpecific: page || 'all',
        layoutPreserved: layout,
        encoding: encoding,
        fileSize: stats.size,
        lastModified: stats.mtime.toISOString(),
        extractedAt: new Date().toISOString(),
        textLength: text.length,
        wordCount: text.split(/\s+/).filter((word) => word.length > 0).length,
      };

      return {
        content: [
          {
            type: 'text',
            text: JSON.stringify(response, null, 2),
          },
        ],
      };
    } catch (error) {
      // Prepare error response. `args` may itself be missing, so it is
      // guarded here to keep the handler from throwing.
      const errorResponse = {
        success: false,
        error: error instanceof Error ? error.message : String(error),
        file: (args && args.path) || 'unknown',
        timestamp: new Date().toISOString(),
        errorType: classifyError(error),
      };

      return {
        content: [
          {
            type: 'text',
            text: JSON.stringify(errorResponse, null, 2),
          },
        ],
      };
    }
  }

  /**
   * Start the MCP server
   *
   * @returns {Promise<void>} Resolves once the server is connected to stdio.
   */
  async run() {
    const transport = new StdioServerTransport();
    await this.server.connect(transport);

    // Log to stderr only (stdout is reserved for MCP JSON-RPC)
    console.error('PDFtotext MCP server v1.0.0 running on stdio');
    console.error('Ready to process PDF text extraction requests...');
  }
}

// Start the server if this script is run directly
if (require.main === module) {
  const server = new PDFtotextServer();
  server.run().catch((error) => {
    console.error('Failed to start PDFtotext MCP server:', error);
    process.exit(1);
  });
}

module.exports = { PDFtotextServer };

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/jpwebb/pdftotext-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server