de en es ja ko ru zh

PDF Reader MCP Server

by sylphxltd

TypeScript

MIT License

432

292

Overview InspectNew Endpoints Schema Related Servers Reviews Score

Need Help? View Source Code Report Issue

readPdf.js•7.45 kB

// PDF reading handler - orchestrates PDF processing workflow
import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';
import { z } from 'zod';
import {
  buildWarnings,
  extractMetadataAndPageCount,
  extractPageContent,
} from '../pdf/extractor.js';
import { loadPdfDocument } from '../pdf/loader.js';
import { determinePagesToProcess, getTargetPages } from '../pdf/parser.js';
import { readPdfArgsSchema } from '../schemas/readPdf.js';

/**
 * Render an arbitrary thrown value as text without ever throwing.
 *
 * JSON.stringify throws on circular structures and BigInt values; because this
 * is called from inside a catch block, a throw here would escape the
 * per-source error handling and reject the whole batch.
 *
 * @param {unknown} value - The thrown value.
 * @returns {string | undefined} JSON text when serialisable (JSON.stringify
 *   yields undefined for e.g. `undefined`), otherwise a string fallback.
 */
const stringifyUnknown = (value) => {
  try {
    return JSON.stringify(value);
  } catch {
    try {
      return String(value);
    } catch {
      // e.g. objects without a usable toString/valueOf
      return Object.prototype.toString.call(value);
    }
  }
};

/**
 * Build the user-facing error message for a source that failed to process.
 *
 * @param {unknown} error - Whatever was thrown while processing the source.
 * @param {string} sourceDescription - Path/URL label of the source.
 * @returns {string} McpError messages are passed through unchanged; anything
 *   else is prefixed with a "Failed to process PDF from ..." sentence.
 */
const describeFailure = (error, sourceDescription) => {
  if (error instanceof McpError) {
    return error.message;
  }
  const prefix = `Failed to process PDF from ${sourceDescription}.`;
  if (error instanceof Error) {
    return `${prefix} Reason: ${error.message}`;
  }
  return `${prefix} Unknown error: ${stringifyUnknown(error)}`;
};

/**
 * Process a single PDF source.
 *
 * Never rejects: every failure is captured in the returned object so that one
 * bad source does not fail the other sources processed alongside it.
 *
 * @param {object} source - One entry of the validated `sources` array; `path`,
 *   `url` and `pages` are read here, everything except `pages` is forwarded to
 *   the loader.
 * @param {{includeFullText: boolean, includeMetadata: boolean,
 *   includePageCount: boolean, includeImages: boolean}} options - Output flags.
 * @returns {Promise<{source: string, success: boolean, data?: object, error?: string}>}
 */
const processSingleSource = async (source, options) => {
  const sourceDescription = source.path ?? source.url ?? 'unknown source';
  try {
    // Parse target pages
    const targetPages = getTargetPages(source.pages, sourceDescription);

    // Load PDF document (`pages` is not a loader argument, so strip it)
    const { pages: _pages, ...loadArgs } = source;
    // NOTE(review): the loaded document is never explicitly released here; if
    // the loader returns a handle that needs cleanup, confirm where that happens.
    const pdfDocument = await loadPdfDocument(loadArgs, sourceDescription);
    const totalPages = pdfDocument.numPages;

    // Extract metadata and page count
    const metadataOutput = await extractMetadataAndPageCount(
      pdfDocument,
      options.includeMetadata,
      options.includePageCount,
    );
    const output = { ...metadataOutput };

    // Determine pages to process
    const { pagesToProcess, invalidPages } = determinePagesToProcess(
      targetPages,
      totalPages,
      options.includeFullText,
    );

    // Add warnings for invalid pages
    const warnings = buildWarnings(invalidPages, totalPages);
    if (warnings.length > 0) {
      output.warnings = warnings;
    }

    if (pagesToProcess.length > 0) {
      // Pages are extracted concurrently; Promise.all keeps results aligned
      // with pagesToProcess by index.
      const pageContents = await Promise.all(
        pagesToProcess.map((pageNum) =>
          extractPageContent(pdfDocument, pageNum, options.includeImages, sourceDescription),
        ),
      );

      // Store page contents for ordered retrieval
      output.page_contents = pageContents.map((items, idx) => ({
        page: pagesToProcess[idx],
        items,
      }));

      // For backward compatibility, also provide text-only outputs
      const extractedPageTexts = pageContents.map((items, idx) => ({
        page: pagesToProcess[idx],
        text: items
          .filter((item) => item.type === 'text')
          .map((item) => item.textContent)
          .join(''),
      }));

      if (targetPages) {
        // Specific pages requested
        output.page_texts = extractedPageTexts;
      } else {
        // Full text requested
        output.full_text = extractedPageTexts.map((p) => p.text).join('\n\n');
      }

      // Extract image metadata for JSON response
      if (options.includeImages) {
        const extractedImages = pageContents
          .flatMap((items) => items.filter((item) => item.type === 'image' && item.imageData))
          .map((item) => item.imageData);
        if (extractedImages.length > 0) {
          output.images = extractedImages;
        }
      }
    }

    return { source: sourceDescription, success: true, data: output };
  } catch (error) {
    return {
      source: sourceDescription,
      success: false,
      error: describeFailure(error, sourceDescription),
      data: undefined,
    };
  }
};

/**
 * Validate raw tool arguments against the read_pdf schema.
 *
 * @param {unknown} args - Raw arguments as received by the tool handler.
 * @returns {object} The parsed arguments.
 * @throws {McpError} ErrorCode.InvalidParams when validation fails.
 */
const parseReadPdfArgs = (args) => {
  try {
    return readPdfArgsSchema.parse(args);
  } catch (error) {
    if (error instanceof z.ZodError) {
      // `issues` is the canonical issue list on ZodError.
      const details = error.issues
        .map((e) => `${e.path.join('.')} (${e.message})`)
        .join(', ');
      throw new McpError(ErrorCode.InvalidParams, `Invalid arguments: ${details}`);
    }
    const message = error instanceof Error ? error.message : String(error);
    throw new McpError(ErrorCode.InvalidParams, `Argument validation failed: ${message}`);
  }
};

/**
 * Produce the JSON-summary form of one per-source result: `page_contents` and
 * the full image objects are dropped, and images are replaced by an
 * `image_info` list of page/index/width/height/format.
 *
 * @param {object} result - A result returned by processSingleSource.
 * @returns {object} The same result when it has no data, otherwise a copy with
 *   the trimmed data.
 */
const toJsonSummary = (result) => {
  if (!result.data) {
    return result;
  }
  const { images, page_contents: _pageContents, ...dataWithoutBinaryContent } = result.data;
  if (!images) {
    return { ...result, data: dataWithoutBinaryContent };
  }
  // Include image count and metadata in JSON, but not the base64 data
  const imageInfo = images.map((img) => ({
    page: img.page,
    index: img.index,
    width: img.width,
    height: img.height,
    format: img.format,
  }));
  return { ...result, data: { ...dataWithoutBinaryContent, image_info: imageInfo } };
};

/**
 * Main handler function for read_pdf tool.
 *
 * @param {unknown} args - Raw tool arguments; validated with readPdfArgsSchema.
 * @returns {Promise<{content: object[]}>} Content parts: first a text part
 *   holding the JSON summary of all sources, then one text or image part per
 *   extracted item, in the order the items appear in each page's list.
 * @throws {McpError} ErrorCode.InvalidParams when the arguments are invalid.
 */
export const handleReadPdfFunc = async (args) => {
  const {
    sources,
    include_full_text,
    include_metadata,
    include_page_count,
    include_images,
  } = parseReadPdfArgs(args);

  const options = {
    includeFullText: include_full_text,
    includeMetadata: include_metadata,
    includePageCount: include_page_count,
    includeImages: include_images,
  };

  // Process all sources concurrently
  const results = await Promise.all(
    sources.map((source) => processSingleSource(source, options)),
  );

  // First content part: structured JSON results (kept for backward compatibility)
  const content = [
    {
      type: 'text',
      text: JSON.stringify({ results: results.map(toJsonSummary) }, null, 2),
    },
  ];

  // Then add page content in the stored item order
  for (const result of results) {
    if (!result.success || !result.data?.page_contents) {
      continue;
    }
    for (const pageContent of result.data.page_contents) {
      for (const item of pageContent.items) {
        if (item.type === 'text' && item.textContent) {
          // Add text content part
          content.push({
            type: 'text',
            text: item.textContent,
          });
        } else if (item.type === 'image' && item.imageData) {
          // Add image content part (all images are now encoded as PNG)
          content.push({
            type: 'image',
            data: item.imageData.data,
            mimeType: 'image/png',
          });
        }
      }
    }
  }

  return { content };
};

// Export the tool definition
export const readPdfToolDefinition = {
  name: 'read_pdf',
  description:
    'Reads content/metadata/images from one or more PDFs (local/URL). Each source can specify pages to extract.',
  schema: readPdfArgsSchema,
  handler: handleReadPdfFunc,
};

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/sylphxltd/pdf-reader-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.