import { z } from 'zod';
import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
import { OPS } from 'pdfjs-dist/legacy/build/pdf.mjs';
import fs from 'node:fs/promises';
import path from 'node:path';
import { resolvePath, PROJECT_ROOT } from '../utils/pathUtils.js';
import { McpError, ErrorCode } from '@modelcontextprotocol/sdk/types.js';
/**
 * Parses one comma-separated piece of a page specification and adds the
 * resulting 1-based page numbers to `pages`.
 *
 * Accepted forms are a single page ("5"), a closed range ("1-3") and an
 * open-ended range ("7-"). Whitespace around the piece and around each range
 * bound is ignored.
 *
 * @param {string} part - Raw piece of the page string.
 * @param {Set<number>} pages - Accumulator that receives every parsed page number.
 * @returns {void}
 * @throws {Error} If the piece is malformed (e.g. "1-2-3", "3abc") or describes
 *   non-positive, reversed or unrepresentably large pages.
 */
const parseRangePart = (part, pages) => {
  // A range never expands to more than this many pages past its start, so an
  // open-ended ("7-") or huge range cannot keep the loop below busy forever.
  const MAX_RANGE_SPAN = 10000;
  // parseInt alone accepts trailing garbage ("3abc" -> 3), so every number is
  // checked against this pattern before it is converted.
  const DIGITS_ONLY = /^\d+$/;
  const toPageNumber = (text) =>
    DIGITS_ONLY.test(text) ? Number.parseInt(text, 10) : Number.NaN;

  const trimmedPart = part.trim();

  if (!trimmedPart.includes('-')) {
    const page = toPageNumber(trimmedPart);
    if (!Number.isSafeInteger(page) || page <= 0) {
      throw new Error(`Invalid page number: ${trimmedPart}`);
    }
    pages.add(page);
    return;
  }

  const bounds = trimmedPart.split('-').map((bound) => bound.trim());
  if (bounds.length !== 2) {
    // More than one hyphen ("1-2-3", "1--3") used to be silently misread.
    throw new Error(`Invalid page range format: ${trimmedPart}`);
  }

  const [startStr, endStr] = bounds;
  const isOpenEnded = endStr === '';
  const start = toPageNumber(startStr);
  const end = isOpenEnded ? Infinity : toPageNumber(endStr);
  // Beyond the safe-integer range `i++` stops changing `i`, which would turn
  // the expansion loop into an infinite loop; reject such bounds up front.
  const endIsValid = isOpenEnded || Number.isSafeInteger(end);
  if (!Number.isSafeInteger(start) || !endIsValid || start <= 0 || start > end) {
    throw new Error(`Invalid page range values: ${trimmedPart}`);
  }

  const practicalEnd = Math.min(end, start + MAX_RANGE_SPAN, Number.MAX_SAFE_INTEGER);
  for (let i = start; i <= practicalEnd; i++) {
    pages.add(i);
  }

  if (practicalEnd < end) {
    // Open-ended ranges always land here; closed ranges only when they span
    // more than MAX_RANGE_SPAN pages.
    console.warn(
      isOpenEnded
        ? `[PDF Reader MCP] Open-ended range starting at ${String(start)} was truncated at page ${String(practicalEnd)} during parsing.`
        : `[PDF Reader MCP] Range ${trimmedPart} was truncated at page ${String(practicalEnd)} during parsing.`
    );
  }
};
/**
 * Expands a complete page specification (e.g. "1-3,5,7-") into page numbers.
 *
 * Each comma-separated piece is handed to `parseRangePart`, which adds its
 * pages to a shared Set, so duplicates across pieces collapse automatically.
 *
 * @param {string} ranges - Comma-separated page specification.
 * @returns {number[]} Unique page numbers in ascending order.
 * @throws {Error} If a piece is rejected by `parseRangePart` or no page results.
 */
const parsePageRanges = (ranges) => {
  const collected = new Set();
  ranges.split(',').forEach((segment) => {
    parseRangePart(segment, collected);
  });

  if (collected.size === 0) {
    throw new Error('Page range string resulted in zero valid pages.');
  }

  return [...collected].sort((left, right) => left - right);
};
// --- Zod Schemas ---
// Page selection for one source: either an explicit list of page numbers or a
// range string such as "1-3,5,7-".
const pageSpecifierSchema = z.union([
  // Non-empty array of positive integers.
  z.array(z.number().int().positive()).min(1),
  // Non-empty string made only of digits, commas and hyphens. Whitespace is
  // tolerated: it is stripped before the character check, not from the value.
  z
    .string()
    .min(1)
    .refine(
      (val) => {
        const withoutWhitespace = val.replace(/\s/g, '');
        return /^[0-9,-]+$/.test(withoutWhitespace);
      },
      { message: 'Page string must contain only numbers, commas, and hyphens.' }
    ),
]);
// One PDF to read: exactly one of `path` or `url`, plus an optional per-source
// page selection. Unknown keys are rejected by `.strict()`.
const PdfSourceSchema = z
  .object({
    path: z.string().min(1).optional().describe('Relative path to the local PDF file.'),
    url: z.url().optional().describe('URL of the PDF file.'),
    pages: pageSpecifierSchema
      .optional()
      .describe(
        "Extract text only from specific pages (1-based) or ranges for *this specific source*. If provided, 'include_full_text' for the entire request is ignored for this source."
      ),
  })
  .strict()
  .refine(
    // Valid only when exactly one of the two locations is present (boolean XOR).
    ({ path: localPath, url }) => Boolean(localPath) !== Boolean(url),
    { message: "Each source must have either 'path' or 'url', but not both." }
  );
// Builds an optional boolean request flag with a default value and a description.
const booleanFlag = (defaultValue, description) =>
  z.boolean().optional().default(defaultValue).describe(description);

// Arguments accepted by the read_pdf tool: at least one source plus global
// switches controlling what is returned for every source. Unknown keys are
// rejected by `.strict()`.
const ReadPdfArgsSchema = z
  .object({
    sources: z
      .array(PdfSourceSchema)
      .min(1)
      .describe('An array of PDF sources to process, each can optionally specify pages.'),
    include_full_text: booleanFlag(
      false,
      "Include the full text content of each PDF (only if 'pages' is not specified for that source)."
    ),
    include_metadata: booleanFlag(true, 'Include metadata and info objects for each PDF.'),
    include_page_count: booleanFlag(true, 'Include the total number of pages for each PDF.'),
    include_images: booleanFlag(
      false,
      'Extract embedded images from PDF pages and return them as base64-encoded data.'
    ),
  })
  .strict();
// --- Helper Functions ---
/**
 * Normalizes the `pages` value of one source into a sorted list of unique
 * page numbers.
 *
 * @param {string | number[] | undefined} sourcePages - Range string or page array; any falsy value means "no selection".
 * @param {string} sourceDescription - Label of the source, used in error messages.
 * @returns {number[] | undefined} Ascending unique pages, or `undefined` when no pages were given.
 * @throws {McpError} `InvalidParams` wrapping any parsing or validation failure.
 */
const getTargetPages = (sourcePages, sourceDescription) => {
  if (!sourcePages) {
    return undefined;
  }

  // Turns the raw specification into pages; failures surface as plain Errors
  // that the caller below converts into an McpError.
  const expandSpecification = () => {
    if (typeof sourcePages === 'string') {
      return parsePageRanges(sourcePages);
    }
    const allPositiveIntegers = sourcePages.every((p) => Number.isInteger(p) && p > 0);
    if (!allPositiveIntegers) {
      throw new Error('Page numbers in array must be positive integers.');
    }
    const uniquePages = Array.from(new Set(sourcePages));
    uniquePages.sort((left, right) => left - right);
    return uniquePages;
  };

  try {
    const targetPages = expandSpecification();
    // An empty input array reaches this point with nothing in it.
    if (targetPages.length === 0) {
      throw new Error('Page specification resulted in an empty set of pages.');
    }
    return targetPages;
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new McpError(
      ErrorCode.InvalidParams,
      `Invalid page specification for source ${sourceDescription}: ${reason}`
    );
  }
};
/**
 * Opens a PDF with PDF.js from either a local file path or a URL.
 *
 * @param {{ path?: string, url?: string }} source - Location of the PDF; `path` takes precedence when both are set.
 * @param {string} sourceDescription - Label of the source, used in error and log messages.
 * @returns {Promise<object>} The document resolved by the PDF.js loading task.
 * @throws {McpError} `InvalidParams` when neither `path` nor `url` is given;
 *   `InvalidRequest` when the file cannot be prepared or PDF.js fails to load it.
 */
const loadPdfDocument = async (source, sourceDescription) => {
  // Validate before the preparation `try` below: thrown inside it, this error
  // would be caught and re-wrapped as InvalidRequest "Failed to prepare PDF
  // source ...", losing both the InvalidParams code and the intended message.
  if (!source?.path && !source?.url) {
    throw new McpError(
      ErrorCode.InvalidParams,
      `Source ${sourceDescription} missing 'path' or 'url'.`
    );
  }

  let pdfDataSource;
  if (source.path) {
    try {
      // resolvePath is defined elsewhere; per the original note it performs
      // the path security checks — TODO confirm against pathUtils.
      const safePath = resolvePath(source.path);
      const fileBuffer = await fs.readFile(safePath);
      pdfDataSource = {
        // View over the bytes that were read, without copying them.
        data: new Uint8Array(fileBuffer.buffer, fileBuffer.byteOffset, fileBuffer.byteLength),
      };
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      const errorCode = ErrorCode.InvalidRequest;
      const isFileNotFound =
        typeof err === 'object' && err !== null && 'code' in err && err.code === 'ENOENT';
      let errorMessage;
      if (isFileNotFound) {
        // Spell out how the path was resolved so a wrong project root is easy to spot.
        const resolvedPath = path.resolve(PROJECT_ROOT, source.path);
        errorMessage =
          `File not found at '${source.path}'. Resolved to: '${resolvedPath}'. Project root: '${PROJECT_ROOT}'. ` +
          `(Tip: Set PDF_READER_MCP_ROOT environment variable to specify the correct project root directory.)`;
      } else {
        // Any other read failure, including errors raised by resolvePath.
        errorMessage = `Failed to prepare PDF source ${sourceDescription}. Reason: ${message}`;
      }
      throw new McpError(errorCode, errorMessage, {
        cause: err instanceof Error ? err : undefined,
      });
    }
  } else {
    pdfDataSource = { url: source.url };
  }

  const loadingTask = pdfjsLib.getDocument(pdfDataSource);
  try {
    return await loadingTask.promise;
  } catch (err) {
    console.error(`[PDF Reader MCP] PDF.js loading error for ${sourceDescription}:`, err);
    const message = err instanceof Error ? err.message : String(err);
    throw new McpError(
      ErrorCode.InvalidRequest,
      // `||` rather than `??`: an empty message should also fall back.
      `Failed to load PDF document from ${sourceDescription}. Reason: ${message || 'Unknown loading error'}`,
      { cause: err instanceof Error ? err : undefined }
    );
  }
};
/**
 * Collects the optional page count and metadata parts of a result.
 *
 * Metadata extraction is best-effort: a failure is logged with console.warn
 * and whatever was gathered up to that point is still returned.
 *
 * @param {object} pdfDocument - Loaded document exposing `numPages` and `getMetadata()`.
 * @param {boolean} includeMetadata - Whether to add `info` and `metadata`.
 * @param {boolean} includePageCount - Whether to add `num_pages`.
 * @returns {Promise<{ num_pages?: number, info?: object, metadata?: object }>}
 */
const extractMetadataAndPageCount = async (pdfDocument, includeMetadata, includePageCount) => {
  const output = {};
  if (includePageCount) {
    output.num_pages = pdfDocument.numPages;
  }
  if (!includeMetadata) {
    return output;
  }

  try {
    const { info, metadata } = await pdfDocument.getMetadata();
    if (info !== undefined) {
      output.info = info;
    }
    // PDF.js reports `metadata` as null when the document carries no XMP
    // metadata. Calling getAll() on it unconditionally threw a TypeError and
    // logged a misleading warning for every such (perfectly valid) PDF.
    const metadataData = metadata?.getAll();
    if (metadataData !== undefined) {
      output.metadata = metadataData;
    }
  } catch (metaError) {
    console.warn(
      `[PDF Reader MCP] Error extracting metadata: ${metaError instanceof Error ? metaError.message : String(metaError)}`
    );
  }
  return output;
};
/**
 * Extracts the text of the given pages, ordering the text items of each page
 * from top to bottom by their Y coordinate.
 *
 * A page that fails is not fatal: the failure is logged and the page's text
 * is replaced by an "Error processing page: ..." message.
 *
 * @param {object} pdfDocument - Loaded document exposing `getPage(pageNumber)`.
 * @param {number[]} pagesToProcess - 1-based page numbers to read.
 * @param {string} sourceDescription - Label of the source, used in log messages.
 * @returns {Promise<Array<{ page: number, text: string }>>} One entry per requested page, sorted by page number.
 */
const extractPageTexts = async (pdfDocument, pagesToProcess, sourceDescription) => {
  const extractedPageTexts = [];
  for (const pageNum of pagesToProcess) {
    let pageText = '';
    try {
      const page = await pdfDocument.getPage(pageNum);
      const { items } = await page.getTextContent();
      pageText = items
        .map((item) => ({
          // PDF.js marks an item that ends a line with `hasEOL`. Joining the
          // bare strings dropped every line break and fused the last word of
          // one line with the first word of the next.
          text: `${item.str ?? ''}${item.hasEOL ? '\n' : ''}`,
          // transform[5] is the item's Y position; 0 when it is unavailable.
          y: item.transform?.[5] ?? 0,
        }))
        // Descending Y means top of the page first. Array#sort is stable, so
        // items sharing a Y value keep the order PDF.js emitted them in.
        .sort((a, b) => b.y - a.y)
        .map(({ text }) => text)
        .join('');
    } catch (pageError) {
      const message = pageError instanceof Error ? pageError.message : String(pageError);
      console.warn(
        `[PDF Reader MCP] Error getting text content for page ${String(pageNum)} in ${sourceDescription}: ${message}`
      );
      pageText = `Error processing page: ${message}`;
    }
    extractedPageTexts.push({ page: pageNum, text: pageText });
  }
  // Callers normally pass sorted pages; sort anyway so the output order never
  // depends on that.
  extractedPageTexts.sort((a, b) => a.page - b.page);
  return extractedPageTexts;
};
/**
 * Extracts the images painted on one page as base64-encoded raw pixel data.
 *
 * Any failure is logged with console.warn and results in an empty list
 * rather than an exception.
 *
 * @param {object} page - PDF.js page exposing `getOperatorList()`, `objs` and `commonObjs`.
 * @param {number} pageNum - 1-based page number, copied into each result.
 * @returns {Promise<Array<{ page: number, index: number, width: number, height: number, format: string, data: string }>>}
 *   `index` is the position of the image among the page's image operations.
 */
const extractImagesFromPage = async (page, pageNum) => {
  const images = [];
  try {
    const operatorList = await page.getOperatorList();

    // Positions of every image-painting operation in the operator list.
    const imageIndices = [];
    for (let i = 0; i < operatorList.fnArray.length; i++) {
      const op = operatorList.fnArray[i];
      if (op === OPS.paintImageXObject || op === OPS.paintXObject) {
        imageIndices.push(i);
      }
    }

    const imagePromises = imageIndices.map(
      (imgIndex, arrayIndex) =>
        new Promise((resolve) => {
          const argsArray = operatorList.argsArray[imgIndex];
          if (!argsArray || argsArray.length === 0) {
            resolve(null);
            return;
          }
          const imageName = argsArray[0];

          // PDF.js keeps document-wide (globally cached) objects, whose ids
          // start with "g_", in `commonObjs`; everything else lives in the
          // page's own `objs`. Asking `objs` for a "g_" id never invokes the
          // callback, which left this promise pending forever.
          const isGlobalObject = typeof imageName === 'string' && imageName.startsWith('g_');
          const objectStore = isGlobalObject && page.commonObjs ? page.commonObjs : page.objs;

          // Callback form of get(): the image may not be resolved yet.
          objectStore.get(imageName, (imageData) => {
            if (!imageData || typeof imageData !== 'object') {
              resolve(null);
              return;
            }
            const img = imageData;
            if (!img.data || !img.width || !img.height) {
              resolve(null);
              return;
            }
            // kind: 1 = grayscale, 3 = RGBA, anything else is reported as RGB.
            const format = img.kind === 1 ? 'grayscale' : img.kind === 3 ? 'rgba' : 'rgb';
            resolve({
              page: pageNum,
              index: arrayIndex,
              width: img.width,
              height: img.height,
              format,
              data: Buffer.from(img.data).toString('base64'),
            });
          });
        })
    );

    const resolvedImages = await Promise.all(imagePromises);
    images.push(...resolvedImages.filter((img) => img !== null));
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    console.warn(
      `[PDF Reader MCP] Error extracting images from page ${String(pageNum)}: ${message}`
    );
  }
  return images;
};
/**
 * Gathers the images of several pages into one flat list.
 *
 * Pages are handled one after another rather than in parallel. A page that
 * cannot be fetched is logged with console.warn and skipped.
 *
 * @param {object} pdfDocument - Loaded document exposing `getPage(pageNumber)`.
 * @param {number[]} pagesToProcess - 1-based page numbers to scan.
 * @returns {Promise<object[]>} Images in page order, as produced by `extractImagesFromPage`.
 */
const extractImages = async (pdfDocument, pagesToProcess) => {
  const collected = [];
  for (const pageNumber of pagesToProcess) {
    try {
      const pageProxy = await pdfDocument.getPage(pageNumber);
      const found = await extractImagesFromPage(pageProxy, pageNumber);
      for (const image of found) {
        collected.push(image);
      }
    } catch (failure) {
      const reason = failure instanceof Error ? failure.message : String(failure);
      console.warn(
        `[PDF Reader MCP] Error getting page ${String(pageNumber)} for image extraction: ${reason}`
      );
    }
  }
  return collected;
};
/**
 * Decides which pages of a document are actually read.
 *
 * With explicit target pages, those are split into pages that exist and pages
 * beyond the end of the document. Without them, every page is selected when
 * `includeFullText` is set, and none otherwise.
 *
 * @param {number[] | undefined} targetPages - Pages requested for this source, if any.
 * @param {number} totalPages - Number of pages in the document.
 * @param {boolean} includeFullText - Request-wide full-text flag.
 * @returns {{ pagesToProcess: number[], invalidPages: number[] }}
 */
const determinePagesToProcess = (targetPages, totalPages, includeFullText) => {
  if (targetPages) {
    const withinDocument = [];
    const beyondDocument = [];
    for (const pageNumber of targetPages) {
      if (pageNumber <= totalPages) {
        withinDocument.push(pageNumber);
      } else if (pageNumber > totalPages) {
        beyondDocument.push(pageNumber);
      }
    }
    return { pagesToProcess: withinDocument, invalidPages: beyondDocument };
  }

  // No per-source selection: fall back to the request-wide flag.
  const everyPage = includeFullText
    ? Array.from({ length: totalPages }, (_unused, index) => index + 1)
    : [];
  return { pagesToProcess: everyPage, invalidPages: [] };
};
/**
 * Processes one PDF source from start to finish and reports the outcome as a
 * result object instead of throwing.
 *
 * @param {{ path?: string, url?: string, pages?: string | number[] }} source - The source to read.
 * @param {boolean} globalIncludeFullText - Read all pages when the source names none.
 * @param {boolean} globalIncludeMetadata - Include `info` / `metadata`.
 * @param {boolean} globalIncludePageCount - Include `num_pages`.
 * @param {boolean} globalIncludeImages - Extract images from the processed pages.
 * @returns {Promise<{ source: string, success: boolean, data?: object, error?: string }>}
 *   `data` is present on success, `error` on failure.
 */
const processSingleSource = async (
  source,
  globalIncludeFullText,
  globalIncludeMetadata,
  globalIncludePageCount,
  globalIncludeImages
) => {
  const sourceDescription = source.path ?? source.url ?? 'unknown source';
  let individualResult = { source: sourceDescription, success: false };
  // Declared outside the `try` so the `finally` below can release it.
  let pdfDocument;
  try {
    // 1. Parse target pages for this source (throws McpError on invalid spec)
    const targetPages = getTargetPages(source.pages, sourceDescription);

    // 2. Load the document (throws McpError on loading failure); `pages` is
    //    stripped because the loader only takes the location.
    const { pages: _pages, ...loadArgs } = source;
    pdfDocument = await loadPdfDocument(loadArgs, sourceDescription);
    const totalPages = pdfDocument.numPages;

    // 3. Extract metadata and page count
    const metadataOutput = await extractMetadataAndPageCount(
      pdfDocument,
      globalIncludeMetadata,
      globalIncludePageCount
    );
    const output = { ...metadataOutput };

    // 4. Determine the pages that really get processed
    const { pagesToProcess, invalidPages } = determinePagesToProcess(
      targetPages,
      totalPages,
      globalIncludeFullText
    );
    if (invalidPages.length > 0) {
      output.warnings = output.warnings ?? [];
      output.warnings.push(
        `Requested page numbers ${invalidPages.join(', ')} exceed total pages (${String(totalPages)}).`
      );
    }

    // 5. Extract text (if needed)
    if (pagesToProcess.length > 0) {
      const extractedPageTexts = await extractPageTexts(
        pdfDocument,
        pagesToProcess,
        sourceDescription
      );
      if (targetPages) {
        // Pages were requested for this source: report them individually.
        output.page_texts = extractedPageTexts;
      } else {
        // No per-source pages: the texts form the document's full text.
        output.full_text = extractedPageTexts.map((p) => p.text).join('\n\n');
      }
    }

    // 6. Extract images (if needed)
    if (globalIncludeImages && pagesToProcess.length > 0) {
      const extractedImages = await extractImages(pdfDocument, pagesToProcess);
      if (extractedImages.length > 0) {
        output.images = extractedImages;
      }
    }

    individualResult = { ...individualResult, data: output, success: true };
  } catch (error) {
    let errorMessage = `Failed to process PDF from ${sourceDescription}.`;
    if (error instanceof McpError) {
      errorMessage = error.message; // McpError messages are already descriptive
    } else if (error instanceof Error) {
      errorMessage += ` Reason: ${error.message}`;
    } else {
      errorMessage += ` Unknown error: ${JSON.stringify(error)}`;
    }
    individualResult.error = errorMessage;
    individualResult.success = false;
    delete individualResult.data; // Ensure no data on error
  } finally {
    // Release the resources PDF.js holds for the document. Previously the
    // document was never destroyed, so a long-running server accumulated one
    // loaded document per request. Cleanup problems are logged only: they
    // must not turn an otherwise successful result into a failure.
    if (typeof pdfDocument?.destroy === 'function') {
      try {
        await pdfDocument.destroy();
      } catch (destroyError) {
        const message =
          destroyError instanceof Error ? destroyError.message : String(destroyError);
        console.warn(
          `[PDF Reader MCP] Error releasing PDF document for ${sourceDescription}: ${message}`
        );
      }
    }
  }
  return individualResult;
};
// --- Main Handler Function ---
/**
 * Entry point of the read_pdf tool: validates the arguments, processes every
 * source concurrently and returns the combined results as pretty-printed JSON
 * inside a single text content item.
 *
 * @param {unknown} args - Raw tool arguments, validated with `ReadPdfArgsSchema`.
 * @returns {Promise<{ content: Array<{ type: 'text', text: string }> }>}
 * @throws {McpError} `InvalidParams` when the arguments fail validation.
 */
export const handleReadPdfFunc = async (args) => {
  let parsedArgs;
  try {
    parsedArgs = ReadPdfArgsSchema.parse(args);
  } catch (validationError) {
    if (validationError instanceof z.ZodError) {
      // One "<dotted.path> (<message>)" entry per reported issue.
      const issueSummary = validationError.issues
        .map((issue) => `${issue.path.join('.')} (${issue.message})`)
        .join(', ');
      throw new McpError(ErrorCode.InvalidParams, `Invalid arguments: ${issueSummary}`);
    }
    // Anything other than a ZodError is reported with its own message.
    const reason =
      validationError instanceof Error ? validationError.message : String(validationError);
    throw new McpError(ErrorCode.InvalidParams, `Argument validation failed: ${reason}`);
  }

  // Start every source at once and wait for all of them to finish.
  const pendingResults = parsedArgs.sources.map((source) =>
    processSingleSource(
      source,
      parsedArgs.include_full_text,
      parsedArgs.include_metadata,
      parsedArgs.include_page_count,
      parsedArgs.include_images
    )
  );
  const results = await Promise.all(pendingResults);

  const text = JSON.stringify({ results }, null, 2);
  return { content: [{ type: 'text', text }] };
};
// Tool definition for read_pdf: bundles the tool's name, its description, the
// argument schema and the handler in one exported object.
export const readPdfToolDefinition = {
  name: 'read_pdf',
  // The sentences are joined without a separator; each one carries its own
  // trailing space where one is needed.
  description: [
    'Reads content/metadata from PDFs. Paths must be relative to the project root. ',
    'Project root can be set via: 1) --root command line argument, 2) PDF_READER_MCP_ROOT environment variable, or 3) process.cwd() (default). ',
    'Absolute paths in the path parameter are rejected.',
  ].join(''),
  schema: ReadPdfArgsSchema,
  handler: handleReadPdfFunc,
};