PDF Reader MCP Server

readPdf.js•17.9 KiB

import { z } from 'zod';
import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
import { OPS } from 'pdfjs-dist/legacy/build/pdf.mjs';
import fs from 'node:fs/promises';
import path from 'node:path';
import { resolvePath, PROJECT_ROOT } from '../utils/pathUtils.js';
import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';

// --- Page range parsing ---

/**
 * Maximum number of pages added beyond `start` when expanding a range.
 * The total page count is not known at parse time, so open-ended ranges
 * ("7-") are expanded up to this cap and filtered against the real page
 * count later (see determinePagesToProcess).
 */
const MAX_RANGE_SPAN = 10000;

/**
 * Parses one comma-separated piece of a page specification ("1-3", "5", "7-")
 * and adds the resulting 1-based page numbers to `pages`.
 *
 * @param {string} part - A single range or page number; surrounding whitespace is ignored.
 * @param {Set<number>} pages - Accumulator that receives the parsed page numbers (mutated).
 * @throws {Error} If the part is not a positive page number or a valid
 *   `start-end` / `start-` range with `start <= end`.
 */
const parseRangePart = (part, pages) => {
  const trimmedPart = part.trim();

  if (!trimmedPart.includes('-')) {
    const page = Number.parseInt(trimmedPart, 10);
    if (Number.isNaN(page) || page <= 0) {
      throw new Error(`Invalid page number: ${trimmedPart}`);
    }
    pages.add(page);
    return;
  }

  // A range must consist of exactly two pieces. Without this check, inputs
  // such as "1--3" or "1-2-3" would be silently misread as "1-" or "1-2".
  const bounds = trimmedPart.split('-');
  if (bounds.length !== 2) {
    throw new Error(`Invalid page range format: ${trimmedPart}`);
  }
  const [startStr, endStr] = bounds;

  const start = Number.parseInt(startStr, 10);
  // An empty end ("7-") means "until the end of the document".
  const end = endStr === '' ? Infinity : Number.parseInt(endStr, 10);
  if (Number.isNaN(start) || Number.isNaN(end) || start <= 0 || start > end) {
    throw new Error(`Invalid page range values: ${trimmedPart}`);
  }

  // Bound the expansion so open-ended (or absurdly large) ranges stay finite.
  const practicalEnd = Math.min(end, start + MAX_RANGE_SPAN);
  for (let i = start; i <= practicalEnd; i++) {
    pages.add(i);
  }

  // NOTE(review): for an open-ended range the cap is always reached, so this
  // warning is emitted for every open-ended range regardless of document size.
  if (end === Infinity && practicalEnd === start + MAX_RANGE_SPAN) {
    console.warn(
      `[PDF Reader MCP] Open-ended range starting at ${String(start)} was truncated at page ${String(practicalEnd)} during parsing.`
    );
  }
};

/**
 * Parses a complete page specification string (e.g. "1-3,5,7-").
 *
 * @param {string} ranges - Comma-separated page numbers and ranges.
 * @returns {number[]} Unique page numbers in ascending order.
 * @throws {Error} If any part is invalid or no pages result.
 */
const parsePageRanges = (ranges) => {
  const pages = new Set();
  for (const part of ranges.split(',')) {
    parseRangePart(part, pages);
  }
  if (pages.size === 0) {
    throw new Error('Page range string resulted in zero valid pages.');
  }
  return Array.from(pages).sort((a, b) => a - b);
};

/**
 * Renders a sorted list of page numbers compactly for messages.
 * Runs of three or more consecutive pages are collapsed to "a-b" so that a
 * large contiguous list (e.g. the expansion of an open-ended range) does not
 * produce an enormous string; shorter lists render as plain "a, b".
 *
 * @param {number[]} pages - Page numbers in ascending order.
 * @returns {string} Comma-separated pages/ranges ('' for an empty list).
 */
const formatPageList = (pages) => {
  const chunks = [];
  let runStart = 0;
  for (let i = 1; i <= pages.length; i++) {
    const continuesRun = i < pages.length && pages[i] === pages[i - 1] + 1;
    if (continuesRun) {
      continue;
    }
    if (i - runStart >= 3) {
      chunks.push(`${String(pages[runStart])}-${String(pages[i - 1])}`);
    } else {
      for (let j = runStart; j < i; j++) {
        chunks.push(String(pages[j]));
      }
    }
    runStart = i;
  }
  return chunks.join(', ');
};

// --- Zod Schemas ---

/** Page selection: a non-empty array of positive integers, or a range string. */
const pageSpecifierSchema = z.union([
  z.array(z.number().int().positive()).min(1),
  z
    .string()
    .min(1)
    // Whitespace is tolerated in the input; the character check ignores it.
    .refine((val) => /^[0-9,-]+$/.test(val.replace(/\s/g, '')), {
      message: 'Page string must contain only numbers, commas, and hyphens.',
    }),
]);

/** One PDF source: exactly one of `path` or `url`, plus optional `pages`. */
const PdfSourceSchema = z
  .object({
    path: z.string().min(1).optional().describe('Relative path to the local PDF file.'),
    url: z.url().optional().describe('URL of the PDF file.'),
    pages: pageSpecifierSchema
      .optional()
      .describe(
        "Extract text only from specific pages (1-based) or ranges for *this specific source*. If provided, 'include_full_text' for the entire request is ignored for this source."
      ),
  })
  .strict()
  .refine((data) => !!(data.path && !data.url) || !!(!data.path && data.url), {
    message: "Each source must have either 'path' or 'url', but not both.",
  });

/** Arguments accepted by the `read_pdf` tool. */
const ReadPdfArgsSchema = z
  .object({
    sources: z
      .array(PdfSourceSchema)
      .min(1)
      .describe('An array of PDF sources to process, each can optionally specify pages.'),
    include_full_text: z
      .boolean()
      .optional()
      .default(false)
      .describe(
        "Include the full text content of each PDF (only if 'pages' is not specified for that source)."
      ),
    include_metadata: z
      .boolean()
      .optional()
      .default(true)
      .describe('Include metadata and info objects for each PDF.'),
    include_page_count: z
      .boolean()
      .optional()
      .default(true)
      .describe('Include the total number of pages for each PDF.'),
    include_images: z
      .boolean()
      .optional()
      .default(false)
      .describe('Extract embedded images from PDF pages and return them as base64-encoded data.'),
  })
  .strict();

// --- Helper Functions ---

/**
 * Normalizes the `pages` value of a single source into a sorted, de-duplicated
 * list of page numbers.
 *
 * @param {string | number[] | undefined} sourcePages - Range string or page array.
 * @param {string} sourceDescription - Source label used in error messages.
 * @returns {number[] | undefined} Sorted pages, or undefined when no pages were requested.
 * @throws {McpError} InvalidParams when the specification is invalid.
 */
const getTargetPages = (sourcePages, sourceDescription) => {
  if (!sourcePages) {
    return undefined;
  }
  try {
    let targetPages;
    if (typeof sourcePages === 'string') {
      targetPages = parsePageRanges(sourcePages);
    } else {
      if (sourcePages.some((p) => !Number.isInteger(p) || p <= 0)) {
        throw new Error('Page numbers in array must be positive integers.');
      }
      targetPages = [...new Set(sourcePages)].sort((a, b) => a - b);
    }
    if (targetPages.length === 0) {
      throw new Error('Page specification resulted in an empty set of pages.');
    }
    return targetPages;
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    throw new McpError(
      ErrorCode.InvalidParams,
      `Invalid page specification for source ${sourceDescription}: ${message}`
    );
  }
};

/**
 * Loads a PDF document from a local path or a URL via pdf.js.
 *
 * @param {{ path?: string, url?: string }} source - Source with `path` or `url`.
 * @param {string} sourceDescription - Source label used in logs and errors.
 * @returns {Promise<object>} The pdf.js document proxy.
 * @throws {McpError} InvalidRequest when the source cannot be prepared or loaded.
 */
const loadPdfDocument = async (source, sourceDescription) => {
  let pdfDataSource;
  try {
    if (source.path) {
      // resolvePath is a project helper; presumably it validates that the path
      // stays inside the project root — confirm in ../utils/pathUtils.js.
      const safePath = resolvePath(source.path);
      const fileBuffer = await fs.readFile(safePath);
      // View over the Buffer's bytes without copying.
      pdfDataSource = {
        data: new Uint8Array(fileBuffer.buffer, fileBuffer.byteOffset, fileBuffer.byteLength),
      };
    } else if (source.url) {
      pdfDataSource = { url: source.url };
    } else {
      // Schema validation should prevent this; kept as a defensive check.
      throw new McpError(
        ErrorCode.InvalidParams,
        `Source ${sourceDescription} missing 'path' or 'url'.`
      );
    }
  } catch (err) {
    let errorMessage;
    const message = err instanceof Error ? err.message : String(err);
    const errorCode = ErrorCode.InvalidRequest;
    const isFileNotFound =
      typeof err === 'object' &&
      err !== null &&
      'code' in err &&
      err.code === 'ENOENT' &&
      source.path;
    if (isFileNotFound) {
      // Show how the relative path was resolved to help diagnose a wrong root.
      const resolvedPath = path.resolve(PROJECT_ROOT, source.path);
      errorMessage =
        `File not found at '${source.path}'. Resolved to: '${resolvedPath}'. Project root: '${PROJECT_ROOT}'. ` +
        `(Tip: Set PDF_READER_MCP_ROOT environment variable to specify the correct project root directory.)`;
    } else {
      errorMessage = `Failed to prepare PDF source ${sourceDescription}. Reason: ${message}`;
    }
    throw new McpError(errorCode, errorMessage, { cause: err instanceof Error ? err : undefined });
  }

  const loadingTask = pdfjsLib.getDocument(pdfDataSource);
  try {
    return await loadingTask.promise;
  } catch (err) {
    console.error(`[PDF Reader MCP] PDF.js loading error for ${sourceDescription}:`, err);
    const message = err instanceof Error ? err.message : String(err);
    throw new McpError(
      ErrorCode.InvalidRequest,
      `Failed to load PDF document from ${sourceDescription}. Reason: ${message || 'Unknown loading error'}`,
      { cause: err instanceof Error ? err : undefined }
    );
  }
};

/**
 * Collects the page count and/or metadata of a loaded document.
 * Metadata failures are logged and otherwise ignored (best effort).
 *
 * @param {object} pdfDocument - pdf.js document proxy.
 * @param {boolean} includeMetadata - Whether to add `info` / `metadata`.
 * @param {boolean} includePageCount - Whether to add `num_pages`.
 * @returns {Promise<object>} Object with any of `num_pages`, `info`, `metadata`.
 */
const extractMetadataAndPageCount = async (pdfDocument, includeMetadata, includePageCount) => {
  const output = {};
  if (includePageCount) {
    output.num_pages = pdfDocument.numPages;
  }
  if (includeMetadata) {
    try {
      const pdfMetadata = await pdfDocument.getMetadata();
      const infoData = pdfMetadata.info;
      if (infoData !== undefined) {
        output.info = infoData;
      }
      // `metadata` is null for PDFs without XMP metadata; optional chaining
      // avoids a TypeError (and a spurious warning) in that common case.
      const metadataData = pdfMetadata.metadata?.getAll();
      if (metadataData !== undefined) {
        output.metadata = metadataData;
      }
    } catch (metaError) {
      console.warn(
        `[PDF Reader MCP] Error extracting metadata: ${metaError instanceof Error ? metaError.message : String(metaError)}`
      );
    }
  }
  return output;
};

/**
 * Extracts the text of each requested page. Text items are ordered by their
 * Y coordinate (descending, i.e. top of page first) and concatenated.
 * A page that fails yields an "Error processing page: ..." text instead of throwing.
 *
 * @param {object} pdfDocument - pdf.js document proxy.
 * @param {number[]} pagesToProcess - 1-based page numbers.
 * @param {string} sourceDescription - Source label used in logs.
 * @returns {Promise<Array<{ page: number, text: string }>>} Per-page text, sorted by page.
 */
const extractPageTexts = async (pdfDocument, pagesToProcess, sourceDescription) => {
  const extractedPageTexts = [];
  for (const pageNum of pagesToProcess) {
    let pageText = '';
    try {
      const page = await pdfDocument.getPage(pageNum);
      const textContent = await page.getTextContent();
      const sortedItems = textContent.items
        .map((item) => ({
          str: item.str,
          // transform[5] is the Y translation; fall back to 0 when absent.
          y: item.transform?.[5] ?? 0,
        }))
        .sort((a, b) => b.y - a.y);
      pageText = sortedItems.map((item) => item.str).join('');
    } catch (pageError) {
      const message = pageError instanceof Error ? pageError.message : String(pageError);
      console.warn(
        `[PDF Reader MCP] Error getting text content for page ${String(pageNum)} in ${sourceDescription}: ${message}`
      );
      pageText = `Error processing page: ${message}`;
    }
    extractedPageTexts.push({ page: pageNum, text: pageText });
  }
  // pagesToProcess is normally sorted already; sort defensively.
  extractedPageTexts.sort((a, b) => a.page - b.page);
  return extractedPageTexts;
};

/**
 * Extracts the images painted on a single page as base64-encoded raw pixel data.
 * Errors are logged and result in fewer (or no) images rather than a throw.
 *
 * @param {object} page - pdf.js page proxy.
 * @param {number} pageNum - 1-based page number recorded on each image.
 * @returns {Promise<Array<{ page: number, index: number, width: number, height: number, format: string, data: string }>>}
 */
const extractImagesFromPage = async (page, pageNum) => {
  const images = [];
  try {
    const operatorList = await page.getOperatorList();

    // Positions of image-painting operators in the operator list.
    const imageIndices = [];
    for (let i = 0; i < operatorList.fnArray.length; i++) {
      const op = operatorList.fnArray[i];
      if (op === OPS.paintImageXObject || op === OPS.paintXObject) {
        imageIndices.push(i);
      }
    }

    const imagePromises = imageIndices.map(
      (imgIndex, arrayIndex) =>
        new Promise((resolve) => {
          const argsArray = operatorList.argsArray[imgIndex];
          if (!argsArray || argsArray.length === 0) {
            resolve(null);
            return;
          }
          const imageName = argsArray[0];
          // Callback form of get(): the object may not be resolved yet.
          page.objs.get(imageName, (imageData) => {
            if (!imageData || typeof imageData !== 'object') {
              resolve(null);
              return;
            }
            const img = imageData;
            if (!img.data || !img.width || !img.height) {
              resolve(null);
              return;
            }
            // Image kind: 1 = grayscale, 3 = RGBA, anything else treated as RGB.
            let format = 'rgb';
            if (img.kind === 1) {
              format = 'grayscale';
            } else if (img.kind === 3) {
              format = 'rgba';
            }
            const base64 = Buffer.from(img.data).toString('base64');
            resolve({
              page: pageNum,
              index: arrayIndex,
              width: img.width,
              height: img.height,
              format,
              data: base64,
            });
          });
        })
    );

    const resolvedImages = await Promise.all(imagePromises);
    images.push(...resolvedImages.filter((img) => img !== null));
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    console.warn(
      `[PDF Reader MCP] Error extracting images from page ${String(pageNum)}: ${message}`
    );
  }
  return images;
};

/**
 * Extracts images from the given pages, one page at a time.
 * A page that cannot be loaded is logged and skipped.
 *
 * @param {object} pdfDocument - pdf.js document proxy.
 * @param {number[]} pagesToProcess - 1-based page numbers.
 * @returns {Promise<object[]>} All extracted images in page order.
 */
const extractImages = async (pdfDocument, pagesToProcess) => {
  const allImages = [];
  for (const pageNum of pagesToProcess) {
    try {
      const page = await pdfDocument.getPage(pageNum);
      const pageImages = await extractImagesFromPage(page, pageNum);
      allImages.push(...pageImages);
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      console.warn(
        `[PDF Reader MCP] Error getting page ${String(pageNum)} for image extraction: ${message}`
      );
    }
  }
  return allImages;
};

/**
 * Resolves which pages to read for a source.
 *
 * @param {number[] | undefined} targetPages - Pages requested for this source, if any.
 * @param {number} totalPages - Number of pages in the document.
 * @param {boolean} includeFullText - Request-level flag; used only when no pages were requested.
 * @returns {{ pagesToProcess: number[], invalidPages: number[] }} Pages within
 *   the document, and requested pages beyond its last page.
 */
const determinePagesToProcess = (targetPages, totalPages, includeFullText) => {
  let pagesToProcess = [];
  let invalidPages = [];
  if (targetPages) {
    pagesToProcess = targetPages.filter((p) => p <= totalPages);
    invalidPages = targetPages.filter((p) => p > totalPages);
  } else if (includeFullText) {
    pagesToProcess = Array.from({ length: totalPages }, (_, i) => i + 1);
  }
  return { pagesToProcess, invalidPages };
};

/**
 * Processes one PDF source end to end. Never throws: failures are reported
 * in the returned result object.
 *
 * @param {{ path?: string, url?: string, pages?: string | number[] }} source - Validated source.
 * @param {boolean} globalIncludeFullText - Request-level `include_full_text`.
 * @param {boolean} globalIncludeMetadata - Request-level `include_metadata`.
 * @param {boolean} globalIncludePageCount - Request-level `include_page_count`.
 * @param {boolean} globalIncludeImages - Request-level `include_images`.
 * @returns {Promise<{ source: string, success: boolean, data?: object, error?: string }>}
 */
const processSingleSource = async (
  source,
  globalIncludeFullText,
  globalIncludeMetadata,
  globalIncludePageCount,
  globalIncludeImages
) => {
  const sourceDescription = source.path ?? source.url ?? 'unknown source';
  let individualResult = { source: sourceDescription, success: false };
  let pdfDocument;

  try {
    // 1. Validate the page specification before doing any I/O.
    const targetPages = getTargetPages(source.pages, sourceDescription);

    // 2. Load the document; `pages` is not part of the loader's input.
    const { pages: _pages, ...loadArgs } = source;
    pdfDocument = await loadPdfDocument(loadArgs, sourceDescription);
    const totalPages = pdfDocument.numPages;

    // 3. Metadata and page count.
    const metadataOutput = await extractMetadataAndPageCount(
      pdfDocument,
      globalIncludeMetadata,
      globalIncludePageCount
    );
    const output = { ...metadataOutput };

    // 4. Resolve the pages to read and report requested pages that do not exist.
    const { pagesToProcess, invalidPages } = determinePagesToProcess(
      targetPages,
      totalPages,
      globalIncludeFullText
    );
    if (invalidPages.length > 0) {
      output.warnings = output.warnings ?? [];
      // Long consecutive runs are collapsed so that an open-ended range
      // (expanded to thousands of pages at parse time) yields a short message.
      output.warnings.push(
        `Requested page numbers ${formatPageList(invalidPages)} exceed total pages (${String(totalPages)}).`
      );
    }

    // 5. Text: per-page when pages were requested, otherwise one joined string.
    if (pagesToProcess.length > 0) {
      const extractedPageTexts = await extractPageTexts(
        pdfDocument,
        pagesToProcess,
        sourceDescription
      );
      if (targetPages) {
        output.page_texts = extractedPageTexts;
      } else {
        output.full_text = extractedPageTexts.map((p) => p.text).join('\n\n');
      }
    }

    // 6. Images, for the same pages as the text.
    if (globalIncludeImages && pagesToProcess.length > 0) {
      const extractedImages = await extractImages(pdfDocument, pagesToProcess);
      if (extractedImages.length > 0) {
        output.images = extractedImages;
      }
    }

    individualResult = { ...individualResult, data: output, success: true };
  } catch (error) {
    let errorMessage = `Failed to process PDF from ${sourceDescription}.`;
    if (error instanceof McpError) {
      errorMessage = error.message;
    } else if (error instanceof Error) {
      errorMessage += ` Reason: ${error.message}`;
    } else {
      errorMessage += ` Unknown error: ${JSON.stringify(error)}`;
    }
    individualResult.error = errorMessage;
    individualResult.success = false;
    delete individualResult.data;
  } finally {
    // Release the document's resources once all data has been copied out;
    // without this every processed source stays allocated.
    if (pdfDocument) {
      try {
        await pdfDocument.destroy();
      } catch (destroyError) {
        const message = destroyError instanceof Error ? destroyError.message : String(destroyError);
        console.warn(
          `[PDF Reader MCP] Error releasing PDF document for ${sourceDescription}: ${message}`
        );
      }
    }
  }
  return individualResult;
};

// --- Main Handler Function ---

/**
 * Tool handler: validates the arguments, processes all sources concurrently,
 * and returns the per-source results as pretty-printed JSON text.
 *
 * @param {unknown} args - Raw tool arguments (validated against ReadPdfArgsSchema).
 * @returns {Promise<{ content: Array<{ type: 'text', text: string }> }>}
 * @throws {McpError} InvalidParams when the arguments fail validation.
 */
export const handleReadPdfFunc = async (args) => {
  let parsedArgs;
  try {
    parsedArgs = ReadPdfArgsSchema.parse(args);
  } catch (error) {
    if (error instanceof z.ZodError) {
      throw new McpError(
        ErrorCode.InvalidParams,
        `Invalid arguments: ${error.issues.map((e) => `${e.path.join('.')} (${e.message})`).join(', ')}`
      );
    }
    const message = error instanceof Error ? error.message : String(error);
    throw new McpError(ErrorCode.InvalidParams, `Argument validation failed: ${message}`);
  }

  const { sources, include_full_text, include_metadata, include_page_count, include_images } =
    parsedArgs;

  // processSingleSource never rejects, so Promise.all cannot fail fast here.
  const results = await Promise.all(
    sources.map((source) =>
      processSingleSource(
        source,
        include_full_text,
        include_metadata,
        include_page_count,
        include_images
      )
    )
  );

  return {
    content: [
      {
        type: 'text',
        text: JSON.stringify({ results }, null, 2),
      },
    ],
  };
};

/** Consolidated tool definition for `read_pdf`. */
export const readPdfToolDefinition = {
  name: 'read_pdf',
  description:
    'Reads content/metadata from PDFs. Paths must be relative to the project root. ' +
    'Project root can be set via: 1) --root command line argument, 2) PDF_READER_MCP_ROOT environment variable, or 3) process.cwd() (default). ' +
    'Absolute paths in the path parameter are rejected.',
  schema: ReadPdfArgsSchema,
  handler: handleReadPdfFunc,
};

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/cydynamic/pdf-reader-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

readPdf.js•17.9 KiB