PDF Agent MCP

Overview Schema Related Servers Score Discussions

pdf-agent-mcp
server

index.js

index.js•99.8 KiB

#!/usr/bin/env node /** * PDF Agent MCP Server * * A Model Context Protocol server for dynamic PDF content extraction and analysis. * Provides tools for selective PDF content extraction, metadata analysis, and document processing. */ import { Server } from "@modelcontextprotocol/sdk/server/index.js"; import { CallToolRequestSchema, ListToolsRequestSchema, } from "@modelcontextprotocol/sdk/types.js"; import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"; import { z } from "zod"; import { readFile, stat, mkdir } from "fs/promises"; import { resolve, join, isAbsolute, basename } from "path"; import { homedir } from "os"; import { createWriteStream } from "fs"; import { pipeline } from "stream/promises"; import { PDFDocument } from "pdf-lib"; import * as pdfjsLib from "pdfjs-dist/legacy/build/pdf.mjs"; import { pdfToPng } from "pdf-to-png-converter"; import sharp from "sharp"; // Configure PDF.js worker to use built-in worker // Console suppression in extractTextNative prevents worker warnings from reaching stdout pdfjsLib.GlobalWorkerOptions.workerSrc = 'pdfjs-dist/build/pdf.worker.mjs'; // Override console methods during PDF operations to prevent stdout contamination const originalConsole = { log: console.log, warn: console.warn, error: console.error, info: console.info }; function suppressConsoleOutput() { console.log = () => { }; console.warn = () => { }; console.error = () => { }; console.info = () => { }; } function restoreConsoleOutput() { console.log = originalConsole.log; console.warn = originalConsole.warn; console.error = originalConsole.error; console.info = originalConsole.info; } // Enhanced logging for MCP environment - write to stderr to avoid corrupting JSON-RPC stdout function log(level, message, data) { const timestamp = new Date().toISOString(); const logMessage = `[${timestamp}] [${level.toUpperCase()}] PDF-Agent-MCP: ${message}`; if (data) { process.stderr.write(`${logMessage} ${JSON.stringify(data)}\n`); } else { process.stderr.write(`${logMessage}\n`); } } /** * Check if a file exists */ async function fileExists(filePath) { try { await stat(filePath); return true; } catch { return false; } } /** * Get the PDF agent home directory path */ function getPdfAgentHome() { return join(homedir(), 'pdf-agent'); } /** * Ensure the PDF agent home directory exists */ async function ensurePdfAgentHome() { const pdfAgentHome = getPdfAgentHome(); try { await mkdir(pdfAgentHome, { recursive: true }); } catch (error) { // Directory might already exist, which is fine if (error instanceof Error && error.code !== 'EEXIST') { throw new Error(`Failed to create PDF agent home directory at ${pdfAgentHome}: ${error.message}`); } } return pdfAgentHome; } // Define Zod schemas for tool inputs const GetPdfMetadataSchema = z.object({ absolute_path: z.string().optional(), relative_path: z.string().optional(), use_pdf_home: z.boolean().default(true), }).refine((data) => (data.absolute_path && !data.relative_path) || (!data.absolute_path && data.relative_path), { message: "Exactly one of 'absolute_path' or 'relative_path' must be provided", }); const GetPdfTextSchema = z.object({ absolute_path: z.string().optional(), relative_path: z.string().optional(), use_pdf_home: z.boolean().default(true), page_range: z.string().default("1:"), extraction_strategy: z.enum(["hybrid", "native"]).default("hybrid"), preserve_formatting: z.boolean().default(true), line_breaks: z.boolean().default(true), }).refine((data) => (data.absolute_path && !data.relative_path) || (!data.absolute_path && data.relative_path), { message: "Exactly one of 'absolute_path' or 'relative_path' must be provided", }); const GetPdfImagesSchema = z.object({ absolute_path: z.string().optional(), relative_path: z.string().optional(), use_pdf_home: z.boolean().default(true), page_range: z.string().default("1:"), format: z.enum(["png", "jpeg"]).default("jpeg"), quality: z.coerce.number().min(1).max(100).default(85), max_width: z.coerce.number().min(100).max(3000).optional(), max_height: z.coerce.number().min(100).max(3000).optional(), }).refine((data) => (data.absolute_path && !data.relative_path) || (!data.absolute_path && data.relative_path), { message: "Exactly one of 'absolute_path' or 'relative_path' must be provided", }); const SearchPdfSchema = z.object({ absolute_path: z.string().optional(), relative_path: z.string().optional(), use_pdf_home: z.boolean().default(true), page_range: z.string().default("1:"), search_pattern: z.string().min(1), max_results: z.coerce.number().min(1).optional(), max_pages_scanned: z.coerce.number().min(1).optional(), context_chars: z.coerce.number().min(10).max(1000).default(150), search_timeout: z.coerce.number().min(1000).max(60000).default(10000), }).refine((data) => (data.absolute_path && !data.relative_path) || (!data.absolute_path && data.relative_path), { message: "Exactly one of 'absolute_path' or 'relative_path' must be provided", }); const GetPdfOutlineSchema = z.object({ absolute_path: z.string().optional(), relative_path: z.string().optional(), use_pdf_home: z.boolean().default(true), include_destinations: z.boolean().default(true), max_depth: z.coerce.number().min(1).max(10).optional(), flatten_structure: z.boolean().default(false), }).refine((data) => (data.absolute_path && !data.relative_path) || (!data.absolute_path && data.relative_path), { message: "Exactly one of 'absolute_path' or 'relative_path' must be provided", }); const DownloadPdfSchema = z.object({ url: z.string().url(), subfolder: z.string().default("downloads"), filename: z.string().optional(), }); const SearchMultiplePdfsSchema = z.object({ files: z.array(z.object({ absolute_path: z.string().optional(), relative_path: z.string().optional(), use_pdf_home: z.boolean().default(true), }).refine((data) => (data.absolute_path && !data.relative_path) || (!data.absolute_path && data.relative_path), { message: "Exactly one of 'absolute_path' or 'relative_path' must be provided for each file" })).min(1), search_pattern: z.string().min(1), parallelism: z.coerce.number().min(1).max(50).default(4), page_range: z.string().default("1:"), max_results_per_file: z.coerce.number().min(1).optional(), max_pages_scanned_per_file: z.coerce.number().min(1).optional(), context_chars: z.coerce.number().min(10).max(1000).default(150), search_timeout: z.coerce.number().min(1000).max(60000).default(10000), }); // Configuration constants const MAX_FILE_SIZE = 400 * 1024 * 1024; // 400MB limit for PDF files const OPERATION_TIMEOUT = 30000; // 30 second timeout for operations /** * Parse page range string into array of page numbers (1-indexed) * Supports formats: * - Single pages: "5" → [5] * - Ranges: "5:10" → [5,6,7,8,9,10] * - Open ranges: "7:" (from 7 to end), ":5" (from start to 5) * - Comma-separated combinations: "1,3:5,7,10:" → [1,3,4,5,7,10,11,...] * - Complex mixed: "1-3,7,8:10" → [1,2,3,7,8,9,10] */ function parsePageRange(rangeStr, totalPages) { const range = rangeStr.trim(); if (!range) { throw new Error("Page range cannot be empty"); } // Split by commas to handle multiple segments const segments = range.split(',').map(seg => seg.trim()).filter(seg => seg.length > 0); if (segments.length === 0) { throw new Error("Page range cannot be empty after parsing"); } const allPages = new Set(); // Process each segment for (const segment of segments) { try { const segmentPages = parseSinglePageRange(segment, totalPages); for (const page of segmentPages) { allPages.add(page); } } catch (error) { throw new Error(`Invalid segment '${segment}': ${error instanceof Error ? error.message : 'Unknown error'}`); } } // Convert to sorted array return Array.from(allPages).sort((a, b) => a - b); } /** * Parse a single page range segment (no commas) * Supports: "5", "5:10", "7:", ":5" */ function parseSinglePageRange(segment, totalPages) { const trimmed = segment.trim(); if (!trimmed) { throw new Error("Page range segment cannot be empty"); } // Single page: "5" if (!trimmed.includes(':')) { const pageNum = parseInt(trimmed, 10); if (isNaN(pageNum) || pageNum < 1 || pageNum > totalPages) { throw new Error(`Invalid page number: ${trimmed}. Must be between 1 and ${totalPages}`); } return [pageNum]; } // Range with colon: "5:10", "7:", ":5" const parts = trimmed.split(':'); if (parts.length !== 2) { throw new Error(`Invalid page range format: ${trimmed}. Use formats like "5", "5:10", "7:", or ":5"`); } let start = 1; let end = totalPages; if (parts[0].trim()) { start = parseInt(parts[0].trim(), 10); if (isNaN(start) || start < 1) { throw new Error(`Invalid start page: ${parts[0]}. Must be a positive number`); } } if (parts[1].trim()) { end = parseInt(parts[1].trim(), 10); if (isNaN(end) || end < 1) { throw new Error(`Invalid end page: ${parts[1]}. Must be a positive number`); } } if (start > end) { throw new Error(`Invalid page range: start page ${start} is greater than end page ${end}`); } if (start > totalPages) { throw new Error(`Start page ${start} exceeds document length of ${totalPages} pages`); } if (end > totalPages) { end = totalPages; } const pages = []; for (let i = start; i <= end; i++) { pages.push(i); } return pages; } /** * Parse search pattern and create RegExp object * Supports /pattern/flags format or plain text */ function parseSearchPattern(pattern) { // Check if pattern is in /pattern/flags format if (pattern.startsWith('/') && pattern.lastIndexOf('/') > 0) { const lastSlash = pattern.lastIndexOf('/'); const regexPattern = pattern.slice(1, lastSlash); const flags = pattern.slice(lastSlash + 1); // Validate flags const validFlags = /^[gimsuvy]*$/; if (!validFlags.test(flags)) { throw new Error(`Invalid regex flags: ${flags}. Valid flags are g, i, m, s, u, v, y`); } try { return { regex: new RegExp(regexPattern, flags), isRegex: true }; } catch (error) { throw new Error(`Invalid regex pattern: ${error instanceof Error ? error.message : 'Unknown error'}`); } } else { // Treat as literal string - escape special regex characters const escapedPattern = pattern.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); return { regex: new RegExp(escapedPattern, 'gi'), isRegex: false }; } } /** * Extract context snippet around a match */ function extractContext(text, matchStart, matchEnd, contextChars) { const start = Math.max(0, matchStart - contextChars); const end = Math.min(text.length, matchEnd + contextChars); const snippet = text.slice(start, end); return { snippet, matchStartInSnippet: matchStart - start, matchEndInSnippet: matchEnd - start, }; } /** * Search for pattern in text with timeout protection */ async function searchWithTimeout(text, regex, timeoutMs) { return new Promise((resolve, reject) => { const timeout = setTimeout(() => { reject(new Error(`Search timed out after ${timeoutMs}ms - pattern may be too complex`)); }, timeoutMs); try { const matches = []; let match; // Reset regex lastIndex to ensure consistent behavior regex.lastIndex = 0; while ((match = regex.exec(text)) !== null) { matches.push(match); // Prevent infinite loop on zero-width matches if (match.index === regex.lastIndex) { regex.lastIndex++; } // Safety check to prevent runaway regex if (matches.length > 10000) { throw new Error('Too many matches found (>10000) - pattern may be too broad'); } } clearTimeout(timeout); resolve(matches); } catch (error) { clearTimeout(timeout); reject(error); } }); } /** * Extract text from PDF using native PDF.js method */ async function extractTextNative(pdfBuffer, pageNumbers) { let pdfDoc; // Suppress console output during PDF.js operations to prevent stdout contamination suppressConsoleOutput(); try { pdfDoc = await pdfjsLib.getDocument({ data: new Uint8Array(pdfBuffer) }).promise; } finally { restoreConsoleOutput(); } const texts = []; for (const pageNum of pageNumbers) { try { // Suppress console output for each page operation as well suppressConsoleOutput(); let page; let textContent; try { page = await pdfDoc.getPage(pageNum); textContent = await page.getTextContent(); } finally { restoreConsoleOutput(); } const textItems = textContent.items; // Combine text items with spacing let pageText = ''; for (let i = 0; i < textItems.length; i++) { const item = textItems[i]; if (item.str) { pageText += item.str; // Add space if next item is on same line but has gap if (i < textItems.length - 1) { const nextItem = textItems[i + 1]; if (nextItem.str && item.transform[5] === nextItem.transform[5]) { // Same line, check for gap const gap = nextItem.transform[4] - item.transform[4] - item.width; if (gap > 5) { pageText += ' '; } } else if (nextItem.str) { // Different line, add newline pageText += '\n'; } } } } texts.push(pageText.trim()); } catch (error) { log('warn', `Failed to extract text from page ${pageNum}`, { error }); texts.push(''); } } return texts; } /** * Extract text using hybrid approach (enhanced native extraction with better error handling) */ async function extractTextHybrid(pdfBuffer, pdfPath, pageNumbers) { try { // Use native extraction with enhanced error handling const nativeTexts = await extractTextNative(pdfBuffer, pageNumbers); // Check if pages have very little text (likely scanned PDFs) const results = []; let scannedPageCount = 0; for (let i = 0; i < nativeTexts.length; i++) { const text = nativeTexts[i]; results.push(text); // Count pages with very little text if (text.trim().length < 50) { scannedPageCount++; } } // Log warning if many pages appear to be scanned if (scannedPageCount > 0) { log('warn', `${scannedPageCount} page(s) extracted very little text - may be scanned/image-based content`); } return results; } catch (error) { log('error', 'Text extraction failed', { error }); throw error; } } /** * Convert image buffer to base64 with optimization */ async function imageToBase64(imageBuffer, options) { try { let processor = sharp(imageBuffer); // Get original metadata const originalMetadata = await processor.metadata(); // Apply resizing if specified if (options.maxWidth || options.maxHeight) { processor = processor.resize(options.maxWidth, options.maxHeight, { fit: 'inside', withoutEnlargement: true }); } // Apply format and quality if (options.format === 'jpeg') { processor = processor.jpeg({ quality: options.quality || 85 }); } else { processor = processor.png({ compressionLevel: 6 }); } const processedBuffer = await processor.toBuffer(); const processedMetadata = await sharp(processedBuffer).metadata(); const base64 = processedBuffer.toString('base64'); const mimeType = options.format === 'jpeg' ? 'image/jpeg' : 'image/png'; return { data: base64, mimeType, metadata: { original: { width: originalMetadata.width, height: originalMetadata.height, size: imageBuffer.length }, processed: { width: processedMetadata.width, height: processedMetadata.height, size: processedBuffer.length, format: options.format, quality: options.quality } } }; } catch (error) { throw new Error(`Image processing failed: ${error}`); } } /** * Extract images from PDF pages using pdf-to-png-converter and convert to base64 */ async function extractPdfImages(pdfPath, pageNumbers, options) { const results = []; try { log('info', `Extracting images from PDF pages: ${pageNumbers.join(', ')}`); // Convert PDF to PNG images (extract all pages first, then filter) const pngPages = await pdfToPng(pdfPath, { disableFontFace: false, useSystemFonts: false, viewportScale: 2.0, // High quality scaling pagesToProcess: pageNumbers.length <= 10 ? pageNumbers : undefined // Only specify if reasonable count }); log('info', `Successfully converted ${pngPages.length} pages to PNG`); // Process each requested page for (const pageNum of pageNumbers) { try { // Find the corresponding page in the results const pageData = pngPages.find(p => p.pageNumber === pageNum); if (pageData && pageData.content) { log('info', `Processing image for page ${pageNum}`); // Convert to base64 with optimization const processed = await imageToBase64(pageData.content, options); // Check size limit (1MB for token efficiency) const sizeInMB = processed.metadata.processed.size / (1024 * 1024); if (sizeInMB > 1) { log('warn', `Image for page ${pageNum} is ${sizeInMB.toFixed(2)}MB, consider reducing quality or size`); } results.push({ page: pageNum, image: { type: "image", data: processed.data, mimeType: processed.mimeType }, metadata: { ...processed.metadata, originalDimensions: { width: pageData.width, height: pageData.height } } }); } else { results.push({ page: pageNum, image: null, metadata: null, error: `Page ${pageNum} not found in PDF conversion results` }); } } catch (error) { log('warn', `Image processing failed for page ${pageNum}`, { error }); results.push({ page: pageNum, image: null, metadata: null, error: `Image processing failed: ${error}` }); } } } catch (error) { log('error', 'PDF to PNG conversion failed', { error }); throw new Error(`PDF image extraction failed: ${error}`); } return results; } /** * Search PDF using extract-all-then-search strategy */ async function searchPdfComprehensive(pdfPath, pageNumbers, searchPattern, contextChars, searchTimeout) { const results = []; const errors = []; try { // Extract text from all pages using hybrid approach log('info', `Extracting text from ${pageNumbers.length} pages for comprehensive search`); const pdfBuffer = await safeReadFile(pdfPath); const pageTexts = await extractTextHybrid(pdfBuffer, pdfPath, pageNumbers); // Parse search pattern const { regex } = parseSearchPattern(searchPattern); // Search each page for (let i = 0; i < pageNumbers.length; i++) { const pageNum = pageNumbers[i]; const pageText = pageTexts[i]; if (!pageText || pageText.trim().length === 0) { errors.push(`Page ${pageNum}: No text extracted`); continue; } try { // Search with timeout protection const matches = await searchWithTimeout(pageText, new RegExp(regex.source, regex.flags), searchTimeout); if (matches.length > 0) { const snippets = matches.map(match => { const context = extractContext(pageText, match.index, match.index + match[0].length, contextChars); return { text: context.snippet, matchStart: context.matchStartInSnippet, matchEnd: context.matchEndInSnippet, }; }); results.push({ page: pageNum, matchCount: matches.length, snippets, }); } } catch (searchError) { errors.push(`Page ${pageNum}: Search failed - ${searchError}`); } } return { matches: results, errors, pagesScanned: pageNumbers.length, }; } catch (error) { throw new Error(`Comprehensive search failed: ${error}`); } } /** * Search PDF using page-by-page strategy with early stopping */ async function searchPdfPageByPage(pdfPath, pageNumbers, searchPattern, contextChars, searchTimeout, maxResults, maxPagesScanned) { const results = []; const errors = []; let totalMatchCount = 0; let pagesScanned = 0; // Parse search pattern once const { regex } = parseSearchPattern(searchPattern); log('info', `Starting page-by-page search with limits: max_results=${maxResults}, max_pages=${maxPagesScanned}`); for (const pageNum of pageNumbers) { // Check if we should stop scanning more pages if (maxPagesScanned && pagesScanned >= maxPagesScanned) { return { matches: results, errors, pagesScanned, completed: false, stoppedReason: 'max_pages', }; } pagesScanned++; try { // Extract text from single page using hybrid approach const pdfBuffer = await safeReadFile(pdfPath); const pageTexts = await extractTextHybrid(pdfBuffer, pdfPath, [pageNum]); const pageText = pageTexts[0]; if (!pageText || pageText.trim().length === 0) { errors.push(`Page ${pageNum}: No text extracted`); continue; } // Search with timeout protection const matches = await searchWithTimeout(pageText, new RegExp(regex.source, regex.flags), searchTimeout); if (matches.length > 0) { const snippets = matches.map(match => { const context = extractContext(pageText, match.index, match.index + match[0].length, contextChars); return { text: context.snippet, matchStart: context.matchStartInSnippet, matchEnd: context.matchEndInSnippet, }; }); results.push({ page: pageNum, matchCount: matches.length, snippets, }); totalMatchCount += matches.length; // Check if we've reached max results if (maxResults && totalMatchCount >= maxResults) { return { matches: results, errors, pagesScanned, completed: false, stoppedReason: 'max_results', }; } } } catch (searchError) { errors.push(`Page ${pageNum}: Search failed - ${searchError}`); } } return { matches: results, errors, pagesScanned, completed: true, stoppedReason: 'completed', }; } /** * Parse PDF destination object to get page number */ function parseDestination(dest, pdfDoc) { try { if (!dest || !Array.isArray(dest) || dest.length === 0) { return undefined; } // First element should be a page reference const pageRef = dest[0]; if (!pageRef || typeof pageRef !== 'object') { return undefined; } // Get page index and convert to 1-based page number const pageIndex = pdfDoc._pagePromises.findIndex((p) => p && p._pageInfo && p._pageInfo.ref === pageRef); return pageIndex >= 0 ? pageIndex + 1 : undefined; } catch (error) { log('warn', 'Failed to parse PDF destination', { error }); return undefined; } } /** * Process outline items recursively */ function processOutlineItems(items, level = 0, maxDepth, pdfDoc, includeDestinations = true) { if (!items || items.length === 0) { return []; } if (maxDepth !== undefined && level >= maxDepth) { return []; } const processedItems = []; for (const item of items) { try { const outlineItem = { title: item.title || '', level, bold: item.bold || false, italic: item.italic || false, }; // Add color if present if (item.color && Array.isArray(item.color) && item.color.length === 3) { outlineItem.color = item.color; } // Add URL if present if (item.url) { outlineItem.url = item.url; } // Parse destination to page number if requested if (includeDestinations && item.dest && pdfDoc) { const pageNum = parseDestination(item.dest, pdfDoc); if (pageNum !== undefined) { outlineItem.page = pageNum; } if (item.dest) { outlineItem.destination = JSON.stringify(item.dest); } } // Process children recursively if (item.items && item.items.length > 0) { outlineItem.children = processOutlineItems(item.items, level + 1, maxDepth, pdfDoc, includeDestinations); } processedItems.push(outlineItem); } catch (error) { log('warn', `Failed to process outline item: ${item.title}`, { error }); } } return processedItems; } /** * Flatten outline structure to a linear list */ function flattenOutlineItems(items) { const flattened = []; for (const item of items) { // Add current item (without children to avoid recursion) const flatItem = { ...item }; delete flatItem.children; flattened.push(flatItem); // Add children recursively if (item.children && item.children.length > 0) { flattened.push(...flattenOutlineItems(item.children)); } } return flattened; } /** * Calculate outline statistics */ function calculateOutlineStats(items) { let totalItems = 0; let maxDepth = 0; let itemsWithPages = 0; let itemsWithUrls = 0; function countItems(items) { for (const item of items) { totalItems++; maxDepth = Math.max(maxDepth, item.level + 1); if (item.page !== undefined) { itemsWithPages++; } if (item.url) { itemsWithUrls++; } if (item.children && item.children.length > 0) { countItems(item.children); } } } countItems(items); return { total_items: totalItems, max_depth: maxDepth, items_with_pages: itemsWithPages, items_with_urls: itemsWithUrls, }; } /** * Download PDF from URL to PDF agent home directory */ async function downloadPdfFromUrl(url, subfolder = "downloads", filename) { try { log('info', `Starting PDF download from URL: ${url}`); // Ensure PDF agent home directory exists const pdfAgentHome = await ensurePdfAgentHome(); const downloadDir = join(pdfAgentHome, subfolder); // Create download directory if it doesn't exist await mkdir(downloadDir, { recursive: true }); // Generate filename if not provided let finalFilename = filename; if (!finalFilename) { try { const urlObj = new URL(url); finalFilename = basename(urlObj.pathname) || `download_${Date.now()}.pdf`; // Ensure .pdf extension if (!finalFilename.toLowerCase().endsWith('.pdf')) { finalFilename += '.pdf'; } } catch { finalFilename = `download_${Date.now()}.pdf`; } } else { // Ensure .pdf extension for provided filename if (!finalFilename.toLowerCase().endsWith('.pdf')) { finalFilename += '.pdf'; } } const filePath = join(downloadDir, finalFilename); // Check if file already exists if (await fileExists(filePath)) { return { success: false, error: `File already exists at ${filePath}. Please provide a different filename or delete the existing file.` }; } log('info', `Downloading PDF to: ${filePath}`); // Download the file with timeout const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), OPERATION_TIMEOUT); try { const response = await fetch(url, { signal: controller.signal, headers: { 'User-Agent': 'PDF-Agent-MCP/1.0.0' } }); clearTimeout(timeoutId); if (!response.ok) { return { success: false, error: `HTTP ${response.status}: ${response.statusText}` }; } // Check content type const contentType = response.headers.get('content-type') || ''; if (!contentType.includes('application/pdf') && !contentType.includes('application/octet-stream')) { log('warn', `Content-Type is not PDF: ${contentType}`); } // Get content length for size check const contentLength = response.headers.get('content-length'); if (contentLength && parseInt(contentLength) > MAX_FILE_SIZE) { return { success: false, error: `File too large: ${(parseInt(contentLength) / 1024 / 1024).toFixed(1)}MB (max ${MAX_FILE_SIZE / 1024 / 1024}MB)` }; } // Stream the response to file const fileStream = createWriteStream(filePath); if (!response.body) { return { success: false, error: 'Empty response body' }; } await pipeline(response.body, fileStream); // Verify the downloaded file const stats = await stat(filePath); if (stats.size === 0) { return { success: false, error: 'Downloaded file is empty' }; } if (stats.size > MAX_FILE_SIZE) { // Clean up oversized file try { await stat(filePath); await import('fs').then(fs => fs.promises.unlink(filePath)); } catch { } return { success: false, error: `Downloaded file too large: ${(stats.size / 1024 / 1024).toFixed(1)}MB (max ${MAX_FILE_SIZE / 1024 / 1024}MB)` }; } // Try to validate it's a PDF by reading the header try { const buffer = await readFile(filePath, { encoding: null }); if (!buffer.subarray(0, 4).toString('ascii').startsWith('%PDF')) { log('warn', 'Downloaded file does not appear to be a valid PDF (missing PDF header)'); } } catch (error) { log('warn', 'Could not validate PDF header', { error }); } log('info', `PDF downloaded successfully: ${stats.size} bytes`); return { success: true, filePath: filePath, metadata: { filename: finalFilename, subfolder: subfolder, size_bytes: stats.size, size_mb: Number((stats.size / (1024 * 1024)).toFixed(2)), url: url, content_type: contentType, downloaded_at: new Date().toISOString() } }; } catch (error) { clearTimeout(timeoutId); // Clean up partial file on error try { if (await fileExists(filePath)) { await import('fs').then(fs => fs.promises.unlink(filePath)); } } catch { } if (error instanceof Error && error.name === 'AbortError') { return { success: false, error: `Download timeout after ${OPERATION_TIMEOUT / 1000} seconds` }; } return { success: false, error: `Download failed: ${error instanceof Error ? error.message : 'Unknown error'}` }; } } catch (error) { log('error', 'PDF download failed', { error }); return { success: false, error: `Download failed: ${error instanceof Error ? error.message : 'Unknown error'}` }; } } /** * Extract PDF outline/table of contents using PDF.js */ async function extractPdfOutline(pdfBuffer, filePath, options) { try { log('info', `Extracting PDF outline from ${filePath}`); // Load PDF document const pdfDoc = await pdfjsLib.getDocument({ data: new Uint8Array(pdfBuffer) }).promise; // Get outline const outline = await pdfDoc.getOutline(); if (!outline || outline.length === 0) { log('info', 'PDF has no outline/bookmarks'); return { file_path: filePath, has_outline: false, outline_items: [], summary: { total_items: 0, max_depth: 0, items_with_pages: 0, items_with_urls: 0, }, }; } log('info', `Found ${outline.length} top-level outline items`); // Process outline items let processedItems = processOutlineItems(outline, 0, options.maxDepth, pdfDoc, options.includeDestinations); // Flatten structure if requested if (options.flattenStructure) { processedItems = flattenOutlineItems(processedItems); } // Calculate statistics const summary = calculateOutlineStats(processedItems); log('info', `Processed outline: ${summary.total_items} items, max depth ${summary.max_depth}`); return { file_path: filePath, has_outline: true, outline_items: processedItems, summary, }; } catch (error) { log('error', 'Failed to extract PDF outline', { error }); throw new Error(`PDF outline extraction failed: ${error}`); } } /** * Search multiple PDFs with parallelism control */ async function searchMultiplePdfsWithParallelism(files, searchPattern, options) { const results = []; // Process files in batches based on parallelism for (let i = 0; i < files.length; i += options.parallelism) { const batch = files.slice(i, i + options.parallelism); const batchPromises = batch.map(async ({ path, originalPath }) => { try { // Check if file exists if (!(await fileExists(path))) { return { file: originalPath, success: false, error: `File not found: ${path}` }; } // Get PDF metadata const pdfBuffer = await safeReadFile(path); let pdfDoc; try { pdfDoc = await PDFDocument.load(pdfBuffer); } catch (error) { if (error instanceof Error && error.message.includes('encrypted')) { pdfDoc = await PDFDocument.load(pdfBuffer, { ignoreEncryption: true }); } else { throw error; } } const totalPages = pdfDoc.getPageCount(); // Parse page range const pageNumbers = parsePageRange(options.pageRange, totalPages); // Determine search strategy const hasLimits = options.maxResultsPerFile !== undefined || options.maxPagesScannedPerFile !== undefined; let searchResult; if (hasLimits) { searchResult = await searchPdfPageByPage(path, pageNumbers, searchPattern, options.contextChars, options.searchTimeout, options.maxResultsPerFile, options.maxPagesScannedPerFile); } else { const comprehensiveResult = await searchPdfComprehensive(path, pageNumbers, searchPattern, options.contextChars, options.searchTimeout); searchResult = { ...comprehensiveResult, completed: true, stoppedReason: 'completed' }; } // Calculate total matches for this file const totalMatches = searchResult.matches.reduce((sum, page) => sum + page.matchCount, 0); return { file: originalPath, success: true, result: { total_pages: totalPages, pages_in_range: pageNumbers.length, total_matches: totalMatches, pages_with_matches: searchResult.matches.length, pages_scanned: searchResult.pagesScanned, completed: searchResult.completed, stopped_reason: searchResult.stoppedReason, matches: searchResult.matches, errors: searchResult.errors } }; } catch (error) { log('error', `Error searching PDF ${originalPath}`, { error }); return { file: originalPath, success: false, error: error instanceof Error ? error.message : String(error) }; } }); // Wait for batch to complete const batchResults = await Promise.allSettled(batchPromises); // Process batch results batchResults.forEach((result, index) => { if (result.status === 'fulfilled') { results.push(result.value); } else { results.push({ file: batch[index].originalPath, success: false, error: `Unexpected error: ${result.reason}` }); } }); } return results; } // Create the MCP server const server = new Server({ name: "pdf-agent-mcp", version: "1.0.0", }, { capabilities: { tools: {}, }, }); // Enhanced file reading with size and timeout protection async function safeReadFile(filePath, maxSize = MAX_FILE_SIZE) { return new Promise(async (resolve, reject) => { const timeout = setTimeout(() => { reject(new Error(`File read timeout after ${OPERATION_TIMEOUT / 1000} seconds. The file may be too large or the system may be under heavy load. Try again or check file size.`)); }, OPERATION_TIMEOUT); try { const stats = await stat(filePath); if (stats.size > maxSize) { clearTimeout(timeout); reject(new Error(`File too large: ${(stats.size / 1024 / 1024).toFixed(1)}MB (max ${maxSize / 1024 / 1024}MB). Please reduce file size or use a smaller PDF.`)); return; } const content = await readFile(filePath); clearTimeout(timeout); resolve(content); } catch (error) { clearTimeout(timeout); if (error instanceof Error && error.code === 'ENOENT') { reject(new Error(`File not found: ${filePath}. Please check the file path and ensure the file exists.`)); } else if (error instanceof Error && error.code === 'EACCES') { reject(new Error(`Permission denied: ${filePath}. Please check file permissions and ensure you have read access.`)); } else { reject(error); } } }); } // Tool handlers server.setRequestHandler(ListToolsRequestSchema, async () => { return { tools: [ { name: "get_pdf_metadata", description: "Extract metadata and basic information from a PDF file, including page count, file size, creation dates, and document properties. Use either absolute_path for any location or relative_path for files in ~/pdf-agent/ directory.", inputSchema: { type: "object", properties: { absolute_path: { type: "string", description: "Absolute path to the PDF file (e.g., '/Users/john/documents/report.pdf')", }, relative_path: { type: "string", description: "Path relative to ~/pdf-agent/ directory (e.g., 'reports/annual.pdf')", }, use_pdf_home: { type: "boolean", description: "Use PDF agent home directory for relative paths (default: true)", default: true, }, }, }, }, { name: "get_pdf_text", description: "Extract text from specific pages or page ranges of a PDF file using native text extraction. Supports Python-style slicing: '5' (single page), '5:10' (range), '7:' (from page 7 to end), ':5' (from start to page 5). Use either absolute_path for any location or relative_path for files in ~/pdf-agent/ directory. Note: Works best with PDFs containing native text; scanned PDFs may yield limited results.", inputSchema: { type: "object", properties: { absolute_path: { type: "string", description: "Absolute path to the PDF file (e.g., '/Users/john/documents/report.pdf')", }, relative_path: { type: "string", description: "Path relative to ~/pdf-agent/ directory (e.g., 'reports/annual.pdf')", }, use_pdf_home: { type: "boolean", description: "Use PDF agent home directory for relative paths (default: true)", default: true, }, page_range: { type: "string", description: "Page range in enhanced Python-style format: '5' (page 5), '5:10' (pages 5-10), '7:' (page 7 to end), ':5' (start to page 5). Also supports comma-separated combinations: '1,3:5,7' (pages 1, 3-5, and 7), '1-3,7,10:' (pages 1-3, 7, and 10 to end). Default: '1:' (all pages)", default: "1:", }, extraction_strategy: { type: "string", description: "Text extraction strategy: 'hybrid' (enhanced native extraction with better error handling), 'native' (standard PDF.js extraction). Default: 'hybrid'", enum: ["hybrid", "native"], default: "hybrid", }, preserve_formatting: { type: "boolean", description: "Preserve text formatting and spacing (default: true)", default: true, }, line_breaks: { type: "boolean", description: "Preserve line breaks in extracted text (default: true)", default: true, }, }, }, }, { name: "get_pdf_images", description: "Extract specific pages or page ranges from a PDF as images for visual analysis. Essential for understanding charts, diagrams, tables, figures, mathematical equations, handwritten content, or any visual elements that text extraction cannot capture. Use when you need to see the actual layout, formatting, or visual content. Supports Python-style slicing: '5' (single page), '5:10' (range), '7:' (from page 7 to end), ':5' (from start to page 5). Returns images as base64-encoded data in MCP image format. Use either absolute_path for any location or relative_path for files in ~/pdf-agent/ directory.", inputSchema: { type: "object", properties: { absolute_path: { type: "string", description: "Absolute path to the PDF file (e.g., '/Users/john/documents/report.pdf')", }, relative_path: { type: "string", description: "Path relative to ~/pdf-agent/ directory (e.g., 'reports/annual.pdf')", }, use_pdf_home: { type: "boolean", description: "Use PDF agent home directory for relative paths (default: true)", default: true, }, page_range: { type: "string", description: "Page range in enhanced Python-style format: '5' (page 5), '5:10' (pages 5-10), '7:' (page 7 to end), ':5' (start to page 5). Also supports comma-separated combinations: '1,3:5,7' (pages 1, 3-5, and 7), '1-3,7,10:' (pages 1-3, 7, and 10 to end). Default: '1:' (all pages)", default: "1:", }, format: { type: "string", description: "Image format: 'jpeg' (smaller file size) or 'png' (higher quality). Default: 'jpeg'", enum: ["jpeg", "png"], default: "jpeg", }, quality: { type: "number", description: "JPEG quality (1-100) - only applies to JPEG format. Higher = better quality but larger size. Default: 85", minimum: 1, maximum: 100, default: 85, }, max_width: { type: "number", description: "Maximum image width in pixels (100-3000). Images will be resized proportionally if larger. Optional.", minimum: 100, maximum: 3000, }, max_height: { type: "number", description: "Maximum image height in pixels (100-3000). Images will be resized proportionally if larger. Optional.", minimum: 100, maximum: 3000, }, }, }, }, { name: "search_pdf", description: "Search for text patterns (including regex) within a PDF file and return matching pages with context snippets. Supports Python-style page ranges and early stopping for performance. Use /pattern/flags format for regex (e.g., '/budget|forecast/gi') or plain text for literal search.", inputSchema: { type: "object", properties: { absolute_path: { type: "string", description: "Absolute path to the PDF file (e.g., '/Users/john/documents/report.pdf')", }, relative_path: { type: "string", description: "Path relative to ~/pdf-agent/ directory (e.g., 'reports/annual.pdf')", }, use_pdf_home: { type: "boolean", description: "Use PDF agent home directory for relative paths (default: true)", default: true, }, page_range: { type: "string", description: "Page range in enhanced Python-style format: '5' (page 5), '5:10' (pages 5-10), '7:' (page 7 to end), ':5' (start to page 5). Also supports comma-separated combinations: '1,3:5,7' (pages 1, 3-5, and 7), '1-3,7,10:' (pages 1-3, 7, and 10 to end). Default: '1:' (all pages)", default: "1:", }, search_pattern: { type: "string", description: "Search pattern: '/regex/flags' format (e.g., '/budget|forecast/gi') or plain text for literal search. Required.", }, max_results: { type: "number", description: "Stop after finding this many total matches. Optional - use for quick searches.", minimum: 1, }, max_pages_scanned: { type: "number", description: "Stop after scanning this many pages. Optional - use for quick searches.", minimum: 1, }, context_chars: { type: "number", description: "Number of characters to include before/after each match for context. Default: 150", minimum: 10, maximum: 1000, default: 150, }, search_timeout: { type: "number", description: "Timeout for search operations in milliseconds. Default: 10000 (10 seconds)", minimum: 1000, maximum: 60000, default: 10000, }, }, }, }, { name: "get_pdf_outline", description: "Extract the table of contents (TOC) or outline/bookmarks structure from a PDF file. Returns hierarchical or flattened list of document sections with titles, page references, and navigation structure. Use either absolute_path for any location or relative_path for files in ~/pdf-agent/ directory.", inputSchema: { type: "object", properties: { absolute_path: { type: "string", description: "Absolute path to the PDF file (e.g., '/Users/john/documents/report.pdf')", }, relative_path: { type: "string", description: "Path relative to ~/pdf-agent/ directory (e.g., 'reports/annual.pdf')", }, use_pdf_home: { type: "boolean", description: "Use PDF agent home directory for relative paths (default: true)", default: true, }, include_destinations: { type: "boolean", description: "Resolve internal destinations to page numbers when possible (default: true)", default: true, }, max_depth: { type: "number", description: "Maximum nesting depth to process (1-10). Optional - limits deep hierarchies", minimum: 1, maximum: 10, }, flatten_structure: { type: "boolean", description: "Return flat list instead of hierarchical tree structure (default: false)", default: false, }, }, }, }, { name: "download_pdf", description: "Download a PDF from a URL and save it to the PDF agent home directory. Downloads to a specified subfolder (default: 'downloads') and returns the full path of the downloaded PDF.", inputSchema: { type: "object", properties: { url: { type: "string", format: "uri", description: "The URL of the PDF to download. Must be a valid HTTP/HTTPS URL.", }, subfolder: { type: "string", description: "Subfolder within ~/pdf-agent/ to save the PDF (default: 'downloads'). Will be created if it doesn't exist.", default: "downloads", }, filename: { type: "string", description: "Optional filename for the downloaded PDF. If not provided, will be derived from URL. Extension .pdf will be added if missing.", }, }, required: ["url"], }, }, { name: "search_multiple_pdfs", description: "Search for text patterns across multiple PDF files in parallel. Processes files concurrently based on the parallelism factor for optimal performance. Increase parallelism (max: 50) to search more files simultaneously and reduce total search time. For large batches of files, prefer a single call with high parallelism rather than multiple smaller calls (e.g., search 100 files with parallelism=50 in one call instead of multiple calls with 20 files each). Returns matches and errors for each file separately.", inputSchema: { type: "object", properties: { files: { type: "array", description: "Array of PDF files to search. Each file must specify either absolute_path or relative_path.", items: { type: "object", properties: { absolute_path: { type: "string", description: "Absolute path to the PDF file" }, relative_path: { type: "string", description: "Path relative to ~/pdf-agent/ directory" }, use_pdf_home: { type: "boolean", description: "Use PDF agent home directory for relative paths (default: true)", default: true } } }, minItems: 1 }, search_pattern: { type: "string", description: "Search pattern: '/regex/flags' format or plain text. Applied to all files." }, parallelism: { type: "number", description: "Number of files to process concurrently. Higher values = faster search. Default: 4, Max: 50", minimum: 1, maximum: 50, default: 4 }, page_range: { type: "string", description: "Page range to search in each file. Default: '1:' (all pages)", default: "1:" }, max_results_per_file: { type: "number", description: "Max matches per file before stopping. Optional.", minimum: 1 }, max_pages_scanned_per_file: { type: "number", description: "Max pages to scan per file. Optional.", minimum: 1 }, context_chars: { type: "number", description: "Characters of context around matches. Default: 150", minimum: 10, maximum: 1000, default: 150 }, search_timeout: { type: "number", description: "Timeout per file in milliseconds. Default: 10000", minimum: 1000, maximum: 60000, default: 10000 } }, required: ["files", "search_pattern"] } }, ], }; }); server.setRequestHandler(CallToolRequestSchema, async (request) => { const { name, arguments: args } = request.params; try { switch (name) { case "get_pdf_metadata": { const { absolute_path, relative_path, use_pdf_home } = GetPdfMetadataSchema.parse(args); try { // Resolve the final path based on parameters let resolvedPath; if (use_pdf_home && relative_path) { // Use relative path from PDF agent home directory const pdfAgentHome = await ensurePdfAgentHome(); resolvedPath = join(pdfAgentHome, relative_path); } else if (absolute_path) { // Use absolute path directly if (!isAbsolute(absolute_path)) { return { content: [ { type: "text", text: JSON.stringify({ error: `Path '${absolute_path}' is not absolute. Use relative_path parameter for relative paths or provide a full absolute path.` }), }, ], }; } resolvedPath = absolute_path; } else { return { content: [ { type: "text", text: JSON.stringify({ error: `Must provide either 'absolute_path' or 'relative_path'. Examples: {"absolute_path": "/Users/john/document.pdf"} or {"relative_path": "reports/annual.pdf"}` }), }, ], }; } if (!(await fileExists(resolvedPath))) { const pathType = relative_path ? 'relative path' : 'absolute path'; const homeInfo = relative_path ? ` (resolved from ~/pdf-agent/ to ${resolvedPath})` : ''; return { content: [ { type: "text", text: JSON.stringify({ error: `PDF file not found at ${pathType}: ${relative_path || absolute_path}${homeInfo}. Please check the file path and ensure the file exists.` }), }, ], }; } // Read the PDF file const pdfBuffer = await safeReadFile(resolvedPath); // Get file stats const stats = await stat(resolvedPath); // Parse PDF to get metadata and page count // Try loading with encryption ignored first for encrypted PDFs let pdfDoc; try { pdfDoc = await PDFDocument.load(pdfBuffer); } catch (error) { if (error instanceof Error && error.message.includes('encrypted')) { pdfDoc = await PDFDocument.load(pdfBuffer, { ignoreEncryption: true }); } else { throw error; } } // Get page count const pageCount = pdfDoc.getPageCount(); // Extract metadata from PDF with error handling const title = pdfDoc.getTitle(); const author = pdfDoc.getAuthor(); const subject = pdfDoc.getSubject(); const creator = pdfDoc.getCreator(); const producer = pdfDoc.getProducer(); // Handle potentially corrupted dates let creationDate = null; let modificationDate = null; try { creationDate = pdfDoc.getCreationDate() || null; } catch (e) { // Ignore corrupted creation date } try { modificationDate = pdfDoc.getModificationDate() || null; } catch (e) { // Ignore corrupted modification date } return { content: [ { type: "text", text: JSON.stringify({ file_path: resolvedPath, pages: pageCount, file_size_bytes: stats.size, file_size_mb: Number((stats.size / (1024 * 1024)).toFixed(2)), created_date: stats.birthtime?.toISOString() || null, modified_date: stats.mtime?.toISOString() || null, title: title || null, author: author || null, subject: subject || null, creator: creator || null, producer: producer || null, creation_date: creationDate?.toISOString() || null, modification_date: modificationDate?.toISOString() || null, encrypted: false, // We handle encrypted PDFs by ignoring encryption }), }, ], }; } catch (e) { const providedPath = relative_path || absolute_path || 'unknown'; const pathType = relative_path ? 'relative path' : 'absolute path'; return { content: [ { type: "text", text: JSON.stringify({ error: `Error processing PDF at ${pathType} '${providedPath}': ${e}. Please ensure the file is a valid PDF and not corrupted.` }), }, ], }; } } case "get_pdf_text": { const { absolute_path, relative_path, use_pdf_home, page_range, extraction_strategy, preserve_formatting, line_breaks } = GetPdfTextSchema.parse(args); try { // Resolve the final path based on parameters (same logic as metadata tool) let resolvedPath; if (use_pdf_home && relative_path) { // Use relative path from PDF agent home directory const pdfAgentHome = await ensurePdfAgentHome(); resolvedPath = join(pdfAgentHome, relative_path); } else if (absolute_path) { // Use absolute path directly if (!isAbsolute(absolute_path)) { return { content: [ { type: "text", text: JSON.stringify({ error: `Path '${absolute_path}' is not absolute. Use relative_path parameter for relative paths or provide a full absolute path.` }), }, ], }; } resolvedPath = absolute_path; } else { return { content: [ { type: "text", text: JSON.stringify({ error: `Must provide either 'absolute_path' or 'relative_path'. Examples: {"absolute_path": "/Users/john/document.pdf"} or {"relative_path": "reports/annual.pdf"}` }), }, ], }; } if (!(await fileExists(resolvedPath))) { const pathType = relative_path ? 'relative path' : 'absolute path'; const homeInfo = relative_path ? ` (resolved from ~/pdf-agent/ to ${resolvedPath})` : ''; return { content: [ { type: "text", text: JSON.stringify({ error: `PDF file not found at ${pathType}: ${relative_path || absolute_path}${homeInfo}. Please check the file path and ensure the file exists.` }), }, ], }; } // Read the PDF file const pdfBuffer = await safeReadFile(resolvedPath); // Get PDF document to determine total pages let pdfDoc; try { pdfDoc = await PDFDocument.load(pdfBuffer); } catch (error) { if (error instanceof Error && error.message.includes('encrypted')) { pdfDoc = await PDFDocument.load(pdfBuffer, { ignoreEncryption: true }); } else { throw error; } } const totalPages = pdfDoc.getPageCount(); // Parse page range const pageNumbers = parsePageRange(page_range, totalPages); log('info', `Extracting text from ${pageNumbers.length} pages using ${extraction_strategy} strategy`, { pages: pageNumbers, strategy: extraction_strategy }); // Extract text based on strategy let extractedTexts; switch (extraction_strategy) { case "native": extractedTexts = await extractTextNative(pdfBuffer, pageNumbers); break; case "hybrid": default: extractedTexts = await extractTextHybrid(pdfBuffer, resolvedPath, pageNumbers); break; } // Format the results const results = pageNumbers.map((pageNum, index) => ({ page: pageNum, text: extractedTexts[index] || '', word_count: (extractedTexts[index] || '').split(/\s+/).filter(word => word.length > 0).length, char_count: (extractedTexts[index] || '').length })); return { content: [ { type: "text", text: JSON.stringify({ file_path: resolvedPath, total_pages: totalPages, extracted_pages: pageNumbers.length, page_range: page_range, extraction_strategy: extraction_strategy, results: results, summary: { total_text_length: results.reduce((sum, r) => sum + r.char_count, 0), total_word_count: results.reduce((sum, r) => sum + r.word_count, 0), pages_with_text: results.filter(r => r.text.trim().length > 0).length } }), }, ], }; } catch (e) { const providedPath = relative_path || absolute_path || 'unknown'; const pathType = relative_path ? 'relative path' : 'absolute path'; return { content: [ { type: "text", text: JSON.stringify({ error: `Error extracting text from PDF at ${pathType} '${providedPath}': ${e}. Please ensure the file is a valid PDF and check the page range format.` }), }, ], }; } } case "get_pdf_images": { const { absolute_path, relative_path, use_pdf_home, page_range, format, quality, max_width, max_height } = GetPdfImagesSchema.parse(args); try { // Resolve the final path based on parameters (same logic as other tools) let resolvedPath; if (use_pdf_home && relative_path) { // Use relative path from PDF agent home directory const pdfAgentHome = await ensurePdfAgentHome(); resolvedPath = join(pdfAgentHome, relative_path); } else if (absolute_path) { // Use absolute path directly if (!isAbsolute(absolute_path)) { return { content: [ { type: "text", text: JSON.stringify({ error: `Path '${absolute_path}' is not absolute. Use relative_path parameter for relative paths or provide a full absolute path.` }), }, ], }; } resolvedPath = absolute_path; } else { return { content: [ { type: "text", text: JSON.stringify({ error: `Must provide either 'absolute_path' or 'relative_path'. Examples: {"absolute_path": "/Users/john/document.pdf"} or {"relative_path": "reports/annual.pdf"}` }), }, ], }; } if (!(await fileExists(resolvedPath))) { const pathType = relative_path ? 'relative path' : 'absolute path'; const homeInfo = relative_path ? ` (resolved from ~/pdf-agent/ to ${resolvedPath})` : ''; return { content: [ { type: "text", text: JSON.stringify({ error: `PDF file not found at ${pathType}: ${relative_path || absolute_path}${homeInfo}. Please check the file path and ensure the file exists.` }), }, ], }; } // Read the PDF file to get total pages const pdfBuffer = await safeReadFile(resolvedPath); // Get PDF document to determine total pages let pdfDoc; try { pdfDoc = await PDFDocument.load(pdfBuffer); } catch (error) { if (error instanceof Error && error.message.includes('encrypted')) { pdfDoc = await PDFDocument.load(pdfBuffer, { ignoreEncryption: true }); } else { throw error; } } const totalPages = pdfDoc.getPageCount(); // Parse page range const pageNumbers = parsePageRange(page_range, totalPages); log('info', `Extracting images from ${pageNumbers.length} pages in ${format} format`, { pages: pageNumbers, format, quality, maxDimensions: { maxWidth: max_width, maxHeight: max_height } }); // Extract images const imageResults = await extractPdfImages(resolvedPath, pageNumbers, { format, quality, maxWidth: max_width, maxHeight: max_height }); // Prepare MCP response with mixed content (text summary + images) const content = []; // Add summary as text const summary = { file_path: resolvedPath, total_pages: totalPages, extracted_pages: pageNumbers.length, page_range: page_range, format: format, quality: quality, max_dimensions: { width: max_width || "original", height: max_height || "original" }, summary: { successful_extractions: imageResults.filter(r => r.image !== null).length, failed_extractions: imageResults.filter(r => r.error).length, total_size_mb: imageResults .filter(r => r.metadata?.processed?.size) .reduce((sum, r) => sum + (r.metadata.processed.size / (1024 * 1024)), 0) .toFixed(2) } }; content.push({ type: "text", text: JSON.stringify(summary, null, 2) }); // Add each successfully extracted image for (const result of imageResults) { if (result.image) { content.push(result.image); } else if (result.error) { content.push({ type: "text", text: JSON.stringify({ page: result.page, error: result.error }) }); } } return { content: content }; } catch (e) { const providedPath = relative_path || absolute_path || 'unknown'; const pathType = relative_path ? 'relative path' : 'absolute path'; return { content: [ { type: "text", text: JSON.stringify({ error: `Error extracting images from PDF at ${pathType} '${providedPath}': ${e}. Please ensure the file is a valid PDF and check the page range format.` }), }, ], }; } } case "search_pdf": { const { absolute_path, relative_path, use_pdf_home, page_range, search_pattern, max_results, max_pages_scanned, context_chars, search_timeout } = SearchPdfSchema.parse(args); try { // Resolve the final path based on parameters let resolvedPath; if (absolute_path) { resolvedPath = resolve(absolute_path); } else { if (use_pdf_home) { const pdfAgentHome = await ensurePdfAgentHome(); resolvedPath = resolve(pdfAgentHome, relative_path); } else { resolvedPath = resolve(relative_path); } } // Check if file exists if (!(await fileExists(resolvedPath))) { throw new Error(`PDF file not found at ${resolvedPath}. Please check the file path and ensure the file exists.`); } // Get PDF metadata to determine page count const pdfBuffer = await safeReadFile(resolvedPath); const pdfDoc = await PDFDocument.load(pdfBuffer); const totalPages = pdfDoc.getPageCount(); // Parse page range const pageNumbers = parsePageRange(page_range, totalPages); // Validate search pattern let searchRegex; let isRegexSearch; try { const parsed = parseSearchPattern(search_pattern); searchRegex = parsed.regex; isRegexSearch = parsed.isRegex; } catch (regexError) { throw new Error(`Invalid search pattern: ${regexError}`); } // Determine search strategy based on limits const hasLimits = max_results !== undefined || max_pages_scanned !== undefined; let searchResults; let searchStrategy; if (hasLimits) { // Use page-by-page search with early stopping searchStrategy = "page_by_page"; searchResults = await searchPdfPageByPage(resolvedPath, pageNumbers, search_pattern, context_chars, search_timeout, max_results, max_pages_scanned); } else { // Use comprehensive search (extract all then search) searchStrategy = "extract_all"; const comprehensiveResults = await searchPdfComprehensive(resolvedPath, pageNumbers, search_pattern, context_chars, search_timeout); searchResults = { ...comprehensiveResults, completed: true, stoppedReason: 'completed' }; } // Create comprehensive summary const totalMatches = searchResults.matches.reduce((sum, page) => sum + page.matchCount, 0); const pagesWithMatches = searchResults.matches.length; const summary = { total_matches: totalMatches, pages_with_matches: pagesWithMatches, pages_scanned: searchResults.pagesScanned, total_pages_in_range: pageNumbers.length, search_strategy: searchStrategy, search_pattern: search_pattern, is_regex: isRegexSearch, completed: searchResults.completed, stopped_reason: searchResults.stoppedReason, context_chars: context_chars, timeout_ms: search_timeout, errors: searchResults.errors?.length || 0 }; // Prepare response content const content = []; // Add summary as first item content.push({ type: "text", text: JSON.stringify(summary, null, 2) }); // Add detailed results if matches found if (searchResults.matches.length > 0) { content.push({ type: "text", text: JSON.stringify({ matches: searchResults.matches, errors: searchResults.errors || [] }, null, 2) }); } // Add error details if any if (searchResults.errors && searchResults.errors.length > 0) { content.push({ type: "text", text: JSON.stringify({ errors: searchResults.errors }, null, 2) }); } return { content: content }; } catch (e) { const providedPath = relative_path || absolute_path || 'unknown'; const pathType = relative_path ? 'relative path' : 'absolute path'; return { content: [ { type: "text", text: JSON.stringify({ error: `Error searching PDF at ${pathType} '${providedPath}': ${e}. Please ensure the file is a valid PDF, check the search pattern format, and verify the page range.` }), }, ], }; } } case "get_pdf_outline": { const { absolute_path, relative_path, use_pdf_home, include_destinations, max_depth, flatten_structure } = GetPdfOutlineSchema.parse(args); try { // Resolve the final path based on parameters let resolvedPath; if (use_pdf_home && relative_path) { // Use relative path from PDF agent home directory const pdfAgentHome = await ensurePdfAgentHome(); resolvedPath = join(pdfAgentHome, relative_path); } else if (absolute_path) { // Use absolute path directly if (!isAbsolute(absolute_path)) { return { content: [ { type: "text", text: JSON.stringify({ error: `Path '${absolute_path}' is not absolute. Use relative_path parameter for relative paths or provide a full absolute path.` }), }, ], }; } resolvedPath = absolute_path; } else { return { content: [ { type: "text", text: JSON.stringify({ error: `Must provide either 'absolute_path' or 'relative_path'. Examples: {"absolute_path": "/Users/john/document.pdf"} or {"relative_path": "reports/annual.pdf"}` }), }, ], }; } if (!(await fileExists(resolvedPath))) { const pathType = relative_path ? 'relative path' : 'absolute path'; const homeInfo = relative_path ? ` (resolved from ~/pdf-agent/ to ${resolvedPath})` : ''; return { content: [ { type: "text", text: JSON.stringify({ error: `PDF file not found at ${pathType} '${relative_path || absolute_path}'${homeInfo}. Please check the file path and ensure the file exists.` }), }, ], }; } // Read PDF file const pdfBuffer = await safeReadFile(resolvedPath); // Extract PDF outline const outlineResult = await extractPdfOutline(pdfBuffer, resolvedPath, { includeDestinations: include_destinations, maxDepth: max_depth, flattenStructure: flatten_structure, }); return { content: [ { type: "text", text: JSON.stringify(outlineResult, null, 2), }, ], }; } catch (e) { const providedPath = relative_path || absolute_path || 'unknown'; const pathType = relative_path ? 'relative path' : 'absolute path'; return { content: [ { type: "text", text: JSON.stringify({ error: `Error extracting PDF outline at ${pathType} '${providedPath}': ${e}. Please ensure the file is a valid PDF and check the file path.` }), }, ], }; } } case "download_pdf": { const { url, subfolder, filename } = DownloadPdfSchema.parse(args); try { const result = await downloadPdfFromUrl(url, subfolder, filename); if (result.success && result.filePath) { return { content: [ { type: "text", text: JSON.stringify({ success: true, file_path: result.filePath, metadata: result.metadata }, null, 2), }, ], }; } else { return { content: [ { type: "text", text: JSON.stringify({ success: false, error: result.error }), }, ], }; } } catch (e) { return { content: [ { type: "text", text: JSON.stringify({ success: false, error: `Download failed: ${e instanceof Error ? e.message : 'Unknown error'}` }), }, ], }; } } case "search_multiple_pdfs": { // Handle case where files might be passed as JSON string let processedArgs = { ...args }; if (args && typeof args.files === 'string') { try { processedArgs.files = JSON.parse(args.files); } catch (e) { throw new Error(`Invalid JSON in files parameter: ${e}`); } } const { files, search_pattern, parallelism, page_range, max_results_per_file, max_pages_scanned_per_file, context_chars, search_timeout } = SearchMultiplePdfsSchema.parse(processedArgs); try { // Resolve all file paths const resolvedFiles = await Promise.all(files.map(async (file) => { let resolvedPath; let originalPath; if (file.use_pdf_home && file.relative_path) { const pdfAgentHome = await ensurePdfAgentHome(); resolvedPath = join(pdfAgentHome, file.relative_path); originalPath = file.relative_path; } else if (file.absolute_path) { if (!isAbsolute(file.absolute_path)) { throw new Error(`Path '${file.absolute_path}' is not absolute`); } resolvedPath = file.absolute_path; originalPath = file.absolute_path; } else { throw new Error('Invalid file specification'); } return { path: resolvedPath, originalPath }; })); log('info', `Starting parallel search across ${files.length} PDFs with parallelism ${parallelism}`); // Perform parallel search const searchResults = await searchMultiplePdfsWithParallelism(resolvedFiles, search_pattern, { parallelism, pageRange: page_range, maxResultsPerFile: max_results_per_file, maxPagesScannedPerFile: max_pages_scanned_per_file, contextChars: context_chars, searchTimeout: search_timeout }); // Calculate summary statistics const successfulSearches = searchResults.filter(r => r.success); const failedSearches = searchResults.filter(r => !r.success); const totalMatches = successfulSearches.reduce((sum, r) => { if (r.result?.total_matches) { return sum + r.result.total_matches; } return sum; }, 0); const totalPagesScanned = successfulSearches.reduce((sum, r) => { if (r.result?.pages_scanned) { return sum + r.result.pages_scanned; } return sum; }, 0); const summary = { files_searched: files.length, successful_searches: successfulSearches.length, failed_searches: failedSearches.length, total_matches_found: totalMatches, total_pages_scanned: totalPagesScanned, search_pattern: search_pattern, parallelism_used: parallelism, page_range: page_range }; log('info', `Search completed: ${totalMatches} matches found across ${successfulSearches.length} files`); return { content: [ { type: "text", text: JSON.stringify({ summary, results: searchResults }, null, 2) } ] }; } catch (e) { log('error', 'Error in search_multiple_pdfs', { error: e }); return { content: [ { type: "text", text: JSON.stringify({ error: `Error searching multiple PDFs: ${e}` }) } ] }; } } default: throw new Error(`Unknown tool: ${name}`); } } catch (error) { return { content: [ { type: "text", text: `Error: ${error}`, }, ], isError: true, }; } }); async function main() { try { log('info', 'Starting PDF Agent MCP Server', { version: '1.0.0', nodeVersion: process.version, platform: process.platform }); const transport = new StdioServerTransport(); await server.connect(transport); log('info', 'PDF Agent MCP Server connected successfully'); } catch (error) { log('error', 'Failed to start server', { error: error instanceof Error ? error.message : error }); throw error; } } // Enhanced error handling with graceful shutdown process.on('SIGINT', () => { log('info', 'Received SIGINT, shutting down gracefully'); process.exit(0); }); process.on('SIGTERM', () => { log('info', 'Received SIGTERM, shutting down gracefully'); process.exit(0); }); process.on('uncaughtException', (error) => { log('error', 'Uncaught exception', { error: error.message, stack: error.stack }); process.exit(1); }); process.on('unhandledRejection', (reason) => { log('error', 'Unhandled rejection', { reason }); process.exit(1); }); main().catch((error) => { log('error', 'Server startup failed', { error: error instanceof Error ? error.message : error }); process.exit(1); }); //# sourceMappingURL=index.js.map

Loading blob content...

Latest Blog Posts

Don't Use Large Strings as Cache Keys
By punkpeye on January 11, 2026.
markdown
node-js
cache
What are Claude Skills?
By punkpeye on January 10, 2026.
mcp
skills
How to Test MCP Streamable HTTP Endpoints Using cURL
By punkpeye on January 2, 2026.
tutorial
bash

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/vlad-ds/pdf-agent-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

index.js•99.8 KiB