PageIndex MCP

process-document.ts•10.9 KiB

import fs from 'node:fs/promises';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import type { CallToolResult } from '@modelcontextprotocol/sdk/types.js';
import open from 'open';
import pRetry, { AbortError } from 'p-retry';
import { z } from 'zod';
import type { PageIndexMcpClient } from '../client/mcp-client.js';
import { createErrorResponse } from '../result.js';
import type { ToolDefinition } from './types.js';

/** Largest document accepted, for both local files and downloads (100MB). */
const MAX_PDF_BYTES = 100 * 1024 * 1024;

/** Timeout applied to each download request and to the upload request. */
const REQUEST_TIMEOUT_MS = 2 * 60 * 1000;

/** Every PDF starts with these four bytes. */
const PDF_MAGIC = Buffer.from('%PDF');

/**
 * Message prefix shared by every "not a PDF" error thrown in this module.
 * The catch block in `processDocument` matches on it, so the throw sites and
 * the matcher must stay in sync.
 */
const INVALID_PDF_PREFIX = 'Not a valid PDF';

/** Error name used to tag a failed arXiv `.pdf` fallback. */
const ARXIV_RETRY_FAILED = 'ArxivRetryFailed';

// Schema for process document parameters - accepts both URLs and local file paths
const processDocumentSchema = z.object({
  url: z.string().describe('URL to a PDF document or local file path'),
});

type ProcessDocumentParams = z.infer<typeof processDocumentSchema>;

/** An in-memory PDF ready to be uploaded. */
interface FileInfo {
  name: string;
  size: number;
  mimeType: string;
  buffer: Buffer;
}

/**
 * Simplified process_document tool that handles only PDF files.
 *
 * Loads the PDF (download for `http(s)://` inputs, local read otherwise),
 * requests a signed upload URL through the MCP client, PUTs the bytes to it
 * and finally calls `submit_document`.
 *
 * @param params - Validated tool input; `url` is a remote URL or a local path.
 * @param mcpClient - Client used to call the remote tools.
 * @returns The `submit_document` result, or an error response with
 *   `next_steps` guidance when any step fails. This function does not throw.
 */
async function processDocument(
  params: ProcessDocumentParams,
  mcpClient: PageIndexMcpClient,
): Promise<CallToolResult> {
  const url = params.url.trim();

  try {
    // Scheme match is case-insensitive so "HTTP://..." is not mistaken for a path.
    const isRemote = /^https?:\/\//i.test(url);
    const fileInfo: FileInfo = isRemote
      ? await downloadPdf(url)
      : await readLocalPdf(url);

    const signedUrlResult = await mcpClient.callTool('get_signed_upload_url', {
      fileName: fileInfo.name,
      fileType: fileInfo.mimeType,
    });

    if (!signedUrlResult.content?.[0]?.text) {
      throw new Error('Failed to get signed upload URL from remote server');
    }

    const uploadInfo = JSON.parse(signedUrlResult.content[0].text as string) as {
      upload_url?: string;
      file_name?: string;
    };

    if (!uploadInfo.upload_url) {
      throw new Error('No upload URL received from server');
    }

    const uploadResponse = await fetch(uploadInfo.upload_url, {
      method: 'PUT',
      body: fileInfo.buffer,
      headers: {
        'Content-Type': fileInfo.mimeType,
      },
      signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
    });

    // Any 2xx is a successful upload; a strict `=== 200` rejects e.g. 201/204.
    if (!uploadResponse.ok) {
      throw new Error(
        `Document upload failed with status ${uploadResponse.status}`,
      );
    }

    const submitResult = await mcpClient.callTool('submit_document', {
      file_name: uploadInfo.file_name,
    });

    if (submitResult.isError) {
      const [content] = submitResult.content || [];

      // The error payload may carry an `open_url` to show the user. Parsing is
      // best-effort: a non-JSON payload must not replace the real tool result
      // with a generic failure.
      let openUrl: unknown;
      try {
        openUrl = JSON.parse((content?.text as string) || '{}')?.open_url;
      } catch {
        openUrl = undefined;
      }

      if (typeof openUrl === 'string' && openUrl) {
        // Fire-and-forget: failing to launch a browser is not a tool failure,
        // but the rejection still has to be handled.
        void open(openUrl).catch((openError: unknown) => {
          const reason =
            openError instanceof Error ? openError.message : String(openError);
          console.error(`Failed to open ${openUrl}: ${reason}\n`);
        });
      }
    }

    return submitResult;
  } catch (error) {
    // Handle PDF validation errors with specific guidance
    if (error instanceof Error && error.message.includes(INVALID_PDF_PREFIX)) {
      return createErrorResponse(
        error.message,
        {},
        {
          next_steps: {
            immediate: 'The document is not a valid PDF.',
            options: [
              'Verify the URL points directly to a PDF document, not a webpage',
              'Check if the URL requires authentication or specific headers',
              'Try accessing the URL in a browser to see what content it returns',
            ],
            auto_retry: 'You can retry with a valid PDF URL',
          },
        },
      );
    }

    // Handle arxiv-specific retry failures
    if (error instanceof Error && error.name === ARXIV_RETRY_FAILED) {
      return createErrorResponse(
        error.message,
        {},
        {
          next_steps: {
            immediate:
              'Failed to retrieve PDF from arXiv. Both original URL and .pdf suffix were tried.',
            options: [
              'Verify the arXiv paper ID is correct (format: YYMM.NNNNN)',
              'Try the direct PDF URL: https://arxiv.org/pdf/PAPER_ID.pdf',
              'Check if the paper exists by visiting https://arxiv.org/abs/PAPER_ID',
            ],
            auto_retry: 'You can retry with the correct arXiv URL format',
          },
        },
      );
    }

    return createErrorResponse(
      error instanceof Error ? error.message : 'Unknown error occurred',
      {},
      {
        next_steps: {
          immediate:
            'PDF processing failed. Please check the document/URL and try again.',
          options: [
            'Ensure the document is a valid PDF',
            'Check document size is under 100MB',
            'Verify the URL is accessible (for remote documents)',
            'Try with a different PDF document',
          ],
          auto_retry:
            'You can retry with the same document, or try a different one',
        },
      },
    );
  }
}

/**
 * Read a local PDF file.
 *
 * @param filePath - Absolute path, path relative to the current working
 *   directory, or a `file://` URL.
 * @returns The file contents plus its base name and size.
 * @throws Error when the path is not a regular file, exceeds the size limit,
 *   lacks a `.pdf` extension or does not start with the PDF magic bytes.
 *   Errors from `fs.stat` / `fs.readFile` propagate unchanged.
 */
async function readLocalPdf(filePath: string): Promise<FileInfo> {
  let resolvedPath = filePath.startsWith('file://')
    ? fileURLToPath(filePath)
    : filePath;

  if (!path.isAbsolute(resolvedPath)) {
    resolvedPath = path.resolve(process.cwd(), resolvedPath);
  }

  const stats = await fs.stat(resolvedPath);

  if (!stats.isFile()) {
    throw new Error(`Path is not a file: ${resolvedPath}`);
  }

  if (stats.size > MAX_PDF_BYTES) {
    throw new Error(
      `Document too large: ${stats.size} bytes (max: ${MAX_PDF_BYTES} bytes)`,
    );
  }

  const fileName = path.basename(resolvedPath);

  if (!fileName.toLowerCase().endsWith('.pdf')) {
    throw new Error(`Document must be a PDF: ${fileName}`);
  }

  const buffer = await fs.readFile(resolvedPath);

  // Validate PDF magic bytes
  if (!buffer.subarray(0, PDF_MAGIC.length).equals(PDF_MAGIC)) {
    throw new Error(`${INVALID_PDF_PREFIX} document: ${fileName}`);
  }

  return {
    name: fileName,
    size: buffer.length,
    mimeType: 'application/pdf',
    buffer,
  };
}

/**
 * Download a PDF from a remote URL with arXiv compatibility and validation.
 *
 * The attempt is retried with backoff (3 retries). Client errors other than
 * 429, oversize documents, non-PDF payloads and a failed arXiv `.pdf` fallback
 * abort immediately instead of being retried.
 *
 * @param url - `http(s)://` URL expected to serve a PDF.
 * @returns The downloaded bytes with a file name ending in `.pdf`.
 * @throws Error describing the failure; a failed arXiv fallback is tagged with
 *   `name === 'ArxivRetryFailed'`.
 */
async function downloadPdf(url: string): Promise<FileInfo> {
  return pRetry(
    async () => {
      /** Fetch once and turn any non-2xx status into an `HTTP <status>` error. */
      const fetchOrThrow = async (fetchUrl: string): Promise<Response> => {
        const res = await fetch(fetchUrl, {
          signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS), // 2 minute timeout
          headers: {
            Accept: 'application/pdf, application/octet-stream, */*',
            'User-Agent': 'Mozilla/5.0 (compatible; PDF-Processor/1.0)',
          },
        });

        if (!res.ok) {
          throw new Error(`HTTP ${res.status}: ${res.statusText}`);
        }

        return res;
      };

      let response: Response;
      try {
        response = await fetchOrThrow(url);
      } catch (error: unknown) {
        // For arxiv.org URLs, try adding .pdf suffix if original request failed
        if (!url.includes('arxiv.org') || url.endsWith('.pdf')) {
          throw error;
        }

        console.error(
          `Initial request failed for arxiv URL: ${url}, retrying with .pdf suffix\n`,
        );
        const retryUrl = url.endsWith('/') ? `${url}pdf` : `${url}.pdf`;

        try {
          response = await fetchOrThrow(retryUrl);
          console.error(
            `Successfully retrieved PDF from retry URL: ${retryUrl}\n`,
          );
        } catch (retryError: unknown) {
          const reason =
            retryError instanceof Error
              ? retryError.message
              : String(retryError);
          console.error(`Retry with .pdf suffix also failed: ${reason}\n`);

          // p-retry rejects with the error wrapped by AbortError, not with the
          // AbortError itself, so the name has to live on the inner error for
          // callers to be able to see it.
          const arxivError = new Error(
            `Failed to retrieve PDF from ${url}. Tried both original URL and ${retryUrl}`,
          );
          arxivError.name = ARXIV_RETRY_FAILED;
          throw new AbortError(arxivError);
        }
      }

      // Check content length. Retrying cannot shrink the document, so abort.
      const contentLength = response.headers.get('content-length');
      if (contentLength && parseInt(contentLength, 10) > MAX_PDF_BYTES) {
        throw new AbortError(
          `Document too large: ${contentLength} bytes (max: 100MB)`,
        );
      }

      // Extract filename from URL or Content-Disposition header
      let filename = path.basename(new URL(url).pathname);
      const contentDisposition = response.headers.get('content-disposition');
      if (contentDisposition) {
        const filenameMatch = contentDisposition.match(
          /filename[^;=\n]*=((['"]).*?\2|[^;\n]*)/,
        );
        if (filenameMatch?.[1]) {
          // The header is server-controlled: drop quotes and any directory
          // components so only a bare file name is forwarded.
          const headerName = path.basename(
            filenameMatch[1].replace(/['"]/g, '').trim(),
          );
          if (headerName) {
            filename = headerName;
          }
        }
      }

      // A URL such as "https://host/" has no usable base name.
      if (!filename) {
        filename = 'document';
      }

      // Ensure filename has .pdf extension if not present
      if (!filename.toLowerCase().endsWith('.pdf')) {
        filename = `${filename}.pdf`;
      }

      const contentType = response.headers.get('content-type');
      const buffer = Buffer.from(await response.arrayBuffer());

      // Content-Length is optional and can be wrong, so enforce the limit on
      // the bytes actually received as well.
      if (buffer.length > MAX_PDF_BYTES) {
        throw new AbortError(
          `Document too large: ${buffer.length} bytes (max: 100MB)`,
        );
      }

      // Validate PDF magic bytes
      if (!buffer.subarray(0, PDF_MAGIC.length).equals(PDF_MAGIC)) {
        throw new AbortError(
          `${INVALID_PDF_PREFIX} document. Got content-type: ${contentType}, filename: ${filename}`,
        );
      }

      // Additional content-type validation (more lenient after magic byte check)
      if (
        contentType &&
        !contentType.includes('pdf') &&
        !contentType.includes('octet-stream')
      ) {
        console.error(
          `Unexpected content-type: ${contentType}, but PDF magic bytes validated\n`,
        );
      }

      return {
        name: filename,
        buffer,
        size: buffer.length,
        mimeType: 'application/pdf',
      };
    },
    {
      retries: 3,
      factor: 2,
      minTimeout: 2000,
      maxTimeout: 10000,
      onFailedAttempt: (error) => {
        // Don't retry on client errors (4xx) except 429 (rate limiting)
        if (error instanceof Error && error.message.includes('HTTP ')) {
          const statusMatch = error.message.match(/HTTP (\d+)/);
          const status = statusMatch ? parseInt(statusMatch[1], 10) : undefined;

          if (status && status >= 400 && status < 500 && status !== 429) {
            throw new AbortError(
              status === 404
                ? 'PDF not found at the provided URL'
                : status === 403
                  ? 'Access denied - URL requires authentication or is blocked'
                  : error.message,
            );
          }
        }

        // Don't retry ArxivRetryFailed errors
        if (error.name === ARXIV_RETRY_FAILED) {
          throw error;
        }

        console.error(
          `PDF download attempt ${error.attemptNumber} failed. ${error.retriesLeft} retries left. URL: ${url}\n`,
        );
      },
    },
  );
}

/** Tool definition pairing the input schema with the `processDocument` handler. */
export const processDocumentTool: ToolDefinition = {
  name: 'process_document',
  description:
    'Upload and process PDF documents from URLs or local files. Supports OCR processing, hierarchical content extraction, and intelligent document analysis. Returns a unique doc_id for subsequent operations. Processing typically takes 0-3 minutes depending on document size (estimate: 2 seconds per page). Supports files up to 100MB.',
  inputSchema: processDocumentSchema,
  handler: processDocument,
};

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/VectifyAI/pageindex-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

process-document.ts•10.9 KiB