import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { z } from "zod";
import fs from "fs/promises";
import path from "path";
import { spawn } from "child_process";
import { fileURLToPath } from "url";
import pdfParse from "pdf-parse"; // Fallback PDF parser
// Identity this process reports as an MCP server; tools are attached to
// the resulting instance further down in this file.
const serverInfo = {
  name: "pdf-mcp-server",
  version: "1.0.0",
};
const server = new McpServer(serverInfo);
// Input schema for the `read_pdf` tool.
const ReadPdfSchema = z.object({
  path: z.string().describe("Absolute path to the PDF file"),
  // Optional 1-based page range. Page numbers are whole and start at 1, so
  // fractional, zero and negative values are rejected at validation time
  // instead of being forwarded to the extraction backends.
  start_page: z.number().int().min(1).optional().describe("Start page (1-based)"),
  end_page: z.number().int().min(1).optional().describe("End page (1-based)")
});
/**
 * Picks the Python interpreter to run.
 *
 * Looks for a virtual environment called `venv` in the directory one level
 * above the folder that contains this module and returns the path of its
 * interpreter when that file is accessible. Otherwise the name of the system
 * interpreter is returned so it is resolved through PATH.
 *
 * @returns Path of the venv interpreter, or the system interpreter's command
 *   name when no venv interpreter is found.
 */
async function getPythonCommand(): Promise<string> {
  const rootDir = path.dirname(path.dirname(fileURLToPath(import.meta.url)));
  const isWindows = process.platform === "win32";

  // Virtual environments keep the interpreter in `bin/` on POSIX systems and
  // in `Scripts\` (as python.exe) on Windows.
  const venvPython = isWindows
    ? path.join(rootDir, "venv", "Scripts", "python.exe")
    : path.join(rootDir, "venv", "bin", "python");

  try {
    await fs.access(venvPython);
    return venvPython;
  } catch {
    // No usable venv: fall back to the system interpreter. Standard Windows
    // installs expose `python` rather than `python3`.
    return isWindows ? "python" : "python3";
  }
}
/** The two fields of a page text item that the fallback extractor reads. */
interface PdfTextItem {
  str: string;
  transform: number[];
}

/**
 * Concatenates text items into page text, inserting a line break whenever the
 * vertical position (`transform[5]`) differs from the previous item's.
 */
function joinPdfTextItems(items: readonly PdfTextItem[]): string {
  let lastY: number | undefined;
  let text = "";
  for (const item of items) {
    const y = item.transform[5];
    // Compare against `undefined` explicitly: a truthiness check would treat
    // a legitimate Y coordinate of 0 as "no previous item" and swallow the
    // line break that should follow it.
    if (lastY === undefined || lastY === y) {
      text += item.str;
    } else {
      text += "\n" + item.str;
    }
    lastY = y;
  }
  return text;
}

/**
 * Fallback JavaScript PDF extraction: reads the file and extracts its text
 * with `pdfParse`, using a custom per-page renderer.
 *
 * Pages outside the requested range render as an empty string; every rendered
 * page is prefixed with a `--- Page N ---` marker line.
 *
 * @param filePath Path of the PDF file to read.
 * @param start_page First page to include (1-based); no lower bound when omitted.
 * @param end_page Last page to include (1-based); no upper bound when omitted.
 * @returns The `text` field of the `pdfParse` result.
 */
async function extractPdfWithJs(filePath: string, start_page?: number, end_page?: number): Promise<string> {
  const dataBuffer = await fs.readFile(filePath);

  // NOTE(review): `pageData` is deliberately left as `any`, as before. The
  // callback returns a promise for in-range pages, and pdf-parse's published
  // typings appear to declare the callback as returning a plain string --
  // confirm against the installed typings before tightening this signature.
  const renderPage = (pageData: any) => {
    const pageNum: number = pageData.pageNumber; // 1-based page number

    // Skip pages outside the requested range.
    const beforeRange = start_page !== undefined && pageNum < start_page;
    const afterRange = end_page !== undefined && pageNum > end_page;
    if (beforeRange || afterRange) {
      return "";
    }

    const renderOptions = {
      normalizeWhitespace: false,
      disableCombineTextItems: false
    };
    return pageData
      .getTextContent(renderOptions)
      .then((textContent: { items: PdfTextItem[] }) =>
        `\n--- Page ${pageNum} ---\n` + joinPdfTextItems(textContent.items)
      );
  };

  const data = await pdfParse(dataBuffer, {
    pagerender: renderPage
  });
  return data.text;
}
/** Outcome of one run of the Python conversion script. */
interface PythonRunResult {
  exitCode: number;
  stdout: string;
  stderr: string;
}

/** Builds a successful tool result carrying a single text item. */
function textResult(text: string) {
  return { content: [{ type: "text" as const, text }] };
}

/** Builds a failed tool result carrying a single text item. */
function errorResult(text: string) {
  return { ...textResult(text), isError: true };
}

/**
 * Spawns `command` with `args` and collects everything it writes to stdout
 * and stderr.
 *
 * A spawn failure reported through the child's "error" event (for example
 * when the interpreter is not installed) is turned into a non-zero exit code
 * rather than an unhandled event, so the caller can fall back gracefully.
 */
function runPythonConverter(command: string, args: string[]): Promise<PythonRunResult> {
  return new Promise<PythonRunResult>((resolve) => {
    const child = spawn(command, args);
    let stdout = "";
    let stderr = "";

    // Decode at the stream level so a multi-byte UTF-8 character that is
    // split across two chunks is reassembled instead of being corrupted.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    child.stdout.on("data", (chunk: string) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk: string) => {
      stderr += chunk;
    });

    // Whichever of "error" / "close" fires first settles the promise; later
    // resolve() calls are no-ops.
    child.on("error", (err) => {
      resolve({ exitCode: 1, stdout, stderr: `${stderr}${err.message}` });
    });
    child.on("close", (code) => {
      // `code` is null when the process was terminated by a signal.
      resolve({ exitCode: code ?? 1, stdout, stderr });
    });
  });
}

// Register the `read_pdf` tool. The handler runs python/convert.py first and
// falls back to the in-process JS extractor when the script cannot be run or
// exits with a non-zero status.
server.registerTool(
  "read_pdf",
  {
    title: "Read PDF Content",
    description: "Read and extract text content from a PDF file. Uses a Python backend (marker-pdf) to preserve mathematical notations (LaTeX) and layout structure. Best for scientific papers.",
    inputSchema: ReadPdfSchema
  },
  async ({ path: filePath, start_page, end_page }) => {
    try {
      // Basic validation
      if (!path.isAbsolute(filePath)) {
        return errorResult("Error: Path must be absolute.");
      }
      // Throws when the file is missing or inaccessible; reported by the
      // outer catch below.
      await fs.access(filePath);

      const pythonCmd = await getPythonCommand();
      const rootDir = path.dirname(path.dirname(fileURLToPath(import.meta.url)));
      const scriptPath = path.join(rootDir, "python", "convert.py");

      // The page range is forwarded to the script only when both bounds are
      // given. NOTE(review): with a single bound the script receives no
      // range, whereas the JS fallback honours it -- confirm what
      // convert.py accepts before changing this.
      const args = [scriptPath, filePath];
      if (start_page !== undefined && end_page !== undefined) {
        args.push(start_page.toString(), end_page.toString());
      }

      const { exitCode, stdout, stderr } = await runPythonConverter(pythonCmd, args);

      if (exitCode !== 0) {
        console.error(`Python script failed with code ${exitCode}. Falling back to JS parser.`);
        const detail = stderr.trim();
        if (detail !== "") {
          console.error(`Python stderr: ${detail}`);
        }
        // Fallback to JS parser
        const jsText = await extractPdfWithJs(filePath, start_page, end_page);
        return textResult(jsText);
      }

      // The script is expected to print a JSON object with either an
      // `error` or a `text` field.
      let parsed: unknown;
      try {
        parsed = JSON.parse(stdout);
      } catch (e) {
        return errorResult(`Failed to parse Python output: ${e}\nRaw Output: ${stdout}\nStderr: ${stderr}`);
      }

      const result: { error?: unknown; text?: unknown } =
        typeof parsed === "object" && parsed !== null ? parsed : {};

      if (result.error) {
        // Avoid "[object Object]" when the backend reports a structured error.
        const reason = typeof result.error === "string" ? result.error : JSON.stringify(result.error);
        return errorResult(`Error from Python backend: ${reason}`);
      }
      if (typeof result.text !== "string") {
        // Never hand a non-string `text` back to the client.
        return errorResult(`Python backend returned no text.\nRaw Output: ${stdout}\nStderr: ${stderr}`);
      }
      return textResult(result.text);
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error);
      return errorResult(`Error reading PDF: ${errorMessage}`);
    }
  }
);
// Entry point: connect the server to a stdio transport, then announce
// readiness on stderr via console.error.
async function main(): Promise<void> {
  await server.connect(new StdioServerTransport());
  console.error("PDF MCP Server running on stdio");
}

// Any failure while starting up is logged and ends the process with status 1.
const reportFatal = (err: unknown): never => {
  console.error("Server error:", err);
  process.exit(1);
};

main().catch(reportFatal);