PDF Reader MCP Server

Overview Schema Related Servers Score Discussions

readPdf.ts

readPdf.ts•7.72 KiB

// PDF reading handler - orchestrates PDF processing workflow

import { image, text, tool, toolError } from '@sylphx/mcp-server-sdk';
import type * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
import {
  buildWarnings,
  extractMetadataAndPageCount,
  extractPageContent,
} from '../pdf/extractor.js';
import { loadPdfDocument } from '../pdf/loader.js';
import { determinePagesToProcess, getTargetPages } from '../pdf/parser.js';
import { readPdfArgsSchema } from '../schemas/readPdf.js';
import type { ExtractedImage, PdfResultData, PdfSource, PdfSourceResult } from '../types/pdf.js';
import { createLogger } from '../utils/logger.js';

const logger = createLogger('ReadPdf');

/**
 * Maximum number of sources processed at the same time.
 * Processing large PDFs concurrently can consume significant memory, so the
 * handler works through the sources in batches of this size.
 */
const MAX_CONCURRENT_SOURCES = 3;

/**
 * Render a thrown non-Error value as text without ever throwing.
 *
 * `JSON.stringify` throws on circular structures and BigInt values; letting
 * that escape from a `catch` block would reject the whole batch instead of
 * marking a single source as failed. The JSON form is kept whenever it is
 * available so existing error messages stay unchanged.
 *
 * @param value - The value that was thrown.
 * @returns A best-effort textual representation of `value`.
 */
const stringifyThrownValue = (value: unknown): string => {
  try {
    // JSON.stringify yields undefined at runtime for undefined/functions/symbols;
    // String() turns that into "undefined", matching template-literal behavior.
    return String(JSON.stringify(value));
  } catch {
    // Not serializable (circular reference, BigInt, throwing toJSON, ...).
  }
  try {
    return String(value);
  } catch {
    // e.g. null-prototype objects cannot be converted to a primitive.
    return Object.prototype.toString.call(value);
  }
};

/**
 * Process a single PDF source.
 *
 * Loads the document, collects the requested metadata / page count, extracts
 * the content of the selected pages and always destroys the document
 * afterwards. Failures are captured in the returned result rather than thrown.
 *
 * @param source - The PDF source (path or URL, optionally with a page selection).
 * @param options - Flags selecting which parts of the document to extract.
 * @returns The per-source result; `success` is false and `error` is set on failure.
 */
const processSingleSource = async (
  source: PdfSource,
  options: {
    includeFullText: boolean;
    includeMetadata: boolean;
    includePageCount: boolean;
    includeImages: boolean;
  }
): Promise<PdfSourceResult> => {
  const sourceDescription = source.path ?? source.url ?? 'unknown source';
  let individualResult: PdfSourceResult = { source: sourceDescription, success: false };
  let pdfDocument: pdfjsLib.PDFDocumentProxy | null = null;

  try {
    // Parse target pages
    const targetPages = getTargetPages(source.pages, sourceDescription);

    // Load PDF document (the page selection is not part of the load arguments)
    const { pages: _pages, ...loadArgs } = source;
    const loadedDocument = await loadPdfDocument(loadArgs, sourceDescription);
    // Keep a reference in the outer scope so the finally block can clean up.
    pdfDocument = loadedDocument;
    const totalPages = loadedDocument.numPages;

    // Extract metadata and page count
    const metadataOutput = await extractMetadataAndPageCount(
      loadedDocument,
      options.includeMetadata,
      options.includePageCount
    );
    const output: PdfResultData = { ...metadataOutput };

    // Determine pages to process
    const { pagesToProcess, invalidPages } = determinePagesToProcess(
      targetPages,
      totalPages,
      options.includeFullText
    );

    // Add warnings for invalid pages
    const warnings = buildWarnings(invalidPages, totalPages);
    if (warnings.length > 0) {
      output.warnings = warnings;
    }

    // Extract content with ordering preserved
    if (pagesToProcess.length > 0) {
      // Pair every page number with its extracted items so that no index
      // lookups (and no type assertions) are needed afterwards.
      const pageEntries = await Promise.all(
        pagesToProcess.map(async (pageNum) => ({
          page: pageNum,
          items: await extractPageContent(
            loadedDocument,
            pageNum,
            options.includeImages,
            sourceDescription
          ),
        }))
      );

      // Store page contents for ordered retrieval
      output.page_contents = pageEntries;

      // For backward compatibility, also provide text-only outputs
      const extractedPageTexts = pageEntries.map(({ page, items }) => ({
        page,
        text: items
          .filter((item) => item.type === 'text')
          .map((item) => item.textContent)
          .join(''),
      }));

      if (targetPages) {
        // Specific pages requested
        output.page_texts = extractedPageTexts;
      } else {
        // Full text requested
        output.full_text = extractedPageTexts.map((p) => p.text).join('\n\n');
      }

      // Extract image metadata for JSON response
      if (options.includeImages) {
        const extractedImages = pageEntries
          .flatMap(({ items }) => items)
          .filter((item) => item.type === 'image' && item.imageData)
          .map((item) => item.imageData)
          .filter((img): img is ExtractedImage => img !== undefined);

        if (extractedImages.length > 0) {
          output.images = extractedImages;
        }
      }
    }

    individualResult = { ...individualResult, data: output, success: true };
  } catch (error: unknown) {
    let errorMessage = `Failed to process PDF from ${sourceDescription}.`;
    if (error instanceof Error) {
      errorMessage += ` Reason: ${error.message}`;
    } else {
      // Must not throw here: an exception would escape this function and
      // reject the whole batch in the caller.
      errorMessage += ` Unknown error: ${stringifyThrownValue(error)}`;
    }
    individualResult.error = errorMessage;
    individualResult.success = false;
    individualResult.data = undefined;
  } finally {
    // Clean up PDF document resources
    if (pdfDocument && typeof pdfDocument.destroy === 'function') {
      try {
        await pdfDocument.destroy();
      } catch (destroyError: unknown) {
        // Log cleanup errors but don't fail the operation
        const message = destroyError instanceof Error ? destroyError.message : String(destroyError);
        logger.warn('Error destroying PDF document', { sourceDescription, error: message });
      }
    }
  }

  return individualResult;
};

/**
 * Tool definition (builder pattern) for reading one or more PDFs.
 *
 * The handler processes the sources in batches of `MAX_CONCURRENT_SOURCES`,
 * returns a tool error when every source failed, and otherwise returns a list
 * of content parts: first a JSON summary (without base64 image data and
 * without `page_contents`), then the text and image items of every
 * successfully processed page in the order the extractor returned them.
 */
export const readPdf = tool()
  .description(
    'Reads content/metadata/images from one or more PDFs (local/URL). Each source can specify pages to extract.'
  )
  .input(readPdfArgsSchema)
  .handler(async ({ input }) => {
    const { sources, include_full_text, include_metadata, include_page_count, include_images } =
      input;

    const options = {
      includeFullText: include_full_text ?? false,
      includeMetadata: include_metadata ?? true,
      includePageCount: include_page_count ?? true,
      includeImages: include_images ?? false,
    };

    // Process sources with a concurrency limit to prevent memory exhaustion
    const results: PdfSourceResult[] = [];
    for (let i = 0; i < sources.length; i += MAX_CONCURRENT_SOURCES) {
      const batch = sources.slice(i, i + MAX_CONCURRENT_SOURCES);
      const batchResults = await Promise.all(
        batch.map((source) => processSingleSource(source, options))
      );
      results.push(...batchResults);
    }

    // Check if all sources failed
    const allFailed = results.every((r) => !r.success);
    if (allFailed) {
      const errorMessages = results.map((r) => r.error).join('; ');
      return toolError(`All PDF sources failed to process: ${errorMessages}`);
    }

    // Build content parts - start with structured JSON for backward compatibility
    const content: Array<ReturnType<typeof text> | ReturnType<typeof image>> = [];

    // Strip image data and page_contents from JSON to keep it manageable
    const resultsForJson = results.map((result) => {
      if (!result.data) {
        return result;
      }

      // page_contents is destructured only so that it is left out of the rest object.
      const { images, page_contents: _pageContents, ...dataWithoutBinaryContent } = result.data;

      if (!images) {
        return { ...result, data: dataWithoutBinaryContent };
      }

      // Include image count and metadata in JSON, but not the base64 data
      const imageInfo = images.map((img) => ({
        page: img.page,
        index: img.index,
        width: img.width,
        height: img.height,
        format: img.format,
      }));
      return { ...result, data: { ...dataWithoutBinaryContent, image_info: imageInfo } };
    });

    // First content part: Structured JSON results
    content.push(text(JSON.stringify({ results: resultsForJson }, null, 2)));

    // Add page content in the order produced by the extractor
    for (const result of results) {
      if (!result.success || !result.data?.page_contents) continue;

      // Process each page's content items in order
      for (const pageContent of result.data.page_contents) {
        for (const item of pageContent.items) {
          if (item.type === 'text' && item.textContent) {
            // Add text content part
            content.push(text(item.textContent));
          } else if (item.type === 'image' && item.imageData) {
            // Add image content part (all images are now encoded as PNG)
            content.push(image(item.imageData.data, 'image/png'));
          }
        }
      }
    }

    return content;
  });

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/SylphxAI/pdf-reader-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

readPdf.ts•7.72 KiB