#!/usr/bin/env node
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { z } from "zod";
import axios, { AxiosError } from "axios";
import { readFileSync, createReadStream, createWriteStream, readdirSync, statSync, mkdirSync, writeFileSync, existsSync, unlinkSync, rmSync, realpathSync } from "node:fs";
import { join, basename, extname, sep } from "node:path";
import { execFileSync } from "node:child_process";
import { tmpdir } from "node:os";
import { randomBytes } from "node:crypto";
import { pipeline } from "node:stream/promises";
// Configuration schema for Smithery
export const configSchema = z.object({
mineruApiKey: z.string().describe("MinerU API key from mineru.net"),
mineruBaseUrl: z
.string()
.optional()
.default("https://mineru.net/api/v4")
.describe("API base URL"),
mineruDefaultModel: z
.enum(["pipeline", "vlm"])
.optional()
.default("pipeline")
.describe("Default model: pipeline (fast) or vlm (90% accuracy)"),
});
type Config = z.infer<typeof configSchema>;
// Error codes with actionable messages
const ERROR_MESSAGES: Record<string, string> = {
A0202: "Token error. Check your API key.",
A0211: "Token expired. Get a new API key.",
"-60002": "Invalid file format. Use: pdf, doc, docx, ppt, pptx, png, jpg, jpeg",
"-60005": "File too large. Max 200MB.",
"-60006": "Too many pages. Max 600 per file. Split the document.",
"-60008": "URL timeout. Check the URL is accessible.",
"-60009": "Queue full. Try again later.",
"-60012": "Task not found. Check task_id is valid.",
"-60013": "Access denied. You can only access your own tasks.",
};
// Response types
interface TaskResponse {
task_id: string;
}
interface TaskStatus {
task_id: string;
data_id?: string;
state: "pending" | "running" | "done" | "failed" | "converting";
full_zip_url?: string;
err_msg?: string;
extract_progress?: {
extracted_pages: number;
total_pages: number;
start_time: string;
};
}
interface BatchResponse {
batch_id: string;
}
interface BatchFileUploadResponse {
batch_id: string;
file_urls: string[];
}
interface BatchStatus {
batch_id: string;
extract_result: Array<{
file_name: string;
state: string;
full_zip_url?: string;
err_msg?: string;
data_id?: string;
extract_progress?: {
extracted_pages: number;
total_pages: number;
start_time: string;
};
}>;
}
// Format helpers
function formatConciseStatus(status: TaskStatus): string {
const parts = [status.state, status.task_id];
if (status.state === "done" && status.full_zip_url) {
parts.push(status.full_zip_url);
} else if (status.state === "running" && status.extract_progress) {
const p = status.extract_progress;
parts.push(`${p.extracted_pages}/${p.total_pages} pages`);
} else if (status.state === "failed" && status.err_msg) {
parts.push(status.err_msg);
}
return parts.join(" | ");
}
function formatDetailedStatus(status: TaskStatus): string {
return JSON.stringify(status, null, 2);
}
function formatConciseBatch(batch: BatchStatus, limit: number, offset: number): string {
const results = batch.extract_result.slice(offset, offset + limit);
const total = batch.extract_result.length;
const done = batch.extract_result.filter((r) => r.state === "done").length;
const lines = [`Batch ${batch.batch_id}: ${done}/${total} done`];
for (const r of results) {
let line = `- ${r.file_name}: ${r.state}`;
if (r.state === "done" && r.full_zip_url) {
line += ` ${r.full_zip_url}`;
} else if (r.state === "running" && r.extract_progress) {
line += ` (${r.extract_progress.extracted_pages}/${r.extract_progress.total_pages})`;
}
lines.push(line);
}
if (offset + limit < total) {
lines.push(`[+${total - offset - limit} more, use offset=${offset + limit}]`);
}
return lines.join("\n");
}
// Create server function for Smithery
export default function createServer({ config }: { config: Config }) {
const apiKey = config.mineruApiKey;
const baseUrl = config.mineruBaseUrl || "https://mineru.net/api/v4";
const defaultModel = config.mineruDefaultModel || "pipeline";
// API client with injected config
async function mineruRequest<T>(
endpoint: string,
method: "GET" | "POST" = "GET",
data?: unknown
): Promise<T> {
if (!apiKey) {
throw new Error("MINERU_API_KEY not set. Add it to your environment.");
}
try {
const response = await axios({
method,
url: `${baseUrl}${endpoint}`,
headers: {
"Content-Type": "application/json",
Authorization: `Bearer ${apiKey}`,
},
data,
});
const result = response.data;
if (result.code !== 0) {
const code = String(result.code);
const msg = ERROR_MESSAGES[code] || result.msg || "Unknown error";
throw new Error(`MinerU error ${code}: ${msg}`);
}
return result.data as T;
} catch (error) {
if (error instanceof AxiosError) {
const code = error.response?.data?.code;
if (code) {
const msg = ERROR_MESSAGES[String(code)] || error.response?.data?.msg;
throw new Error(`MinerU error ${code}: ${msg}`);
}
throw new Error(`HTTP ${error.response?.status}: ${error.message}`);
}
throw error;
}
}
// Create MCP server
const server = new McpServer({
name: "mineru",
version: "1.0.2",
});
// Tool 1: mineru_parse
server.tool(
"mineru_parse",
"Parse a document URL. Returns task_id to check status.",
{
url: z.string().describe("Document URL (PDF, DOC, PPT, images)"),
model: z
.enum(["pipeline", "vlm"])
.optional()
.describe("pipeline=fast, vlm=90% accuracy"),
pages: z.string().optional().describe("Page range: 1-10,15 or 2--2"),
ocr: z.boolean().optional().describe("Enable OCR (pipeline only)"),
formula: z.boolean().optional().describe("Formula recognition"),
table: z.boolean().optional().describe("Table recognition"),
language: z.string().optional().describe("Language code: ch, en, etc"),
formats: z
.array(z.enum(["docx", "html", "latex"]))
.optional()
.describe("Extra export formats"),
},
async (params) => {
const requestData: Record<string, unknown> = {
url: params.url,
model_version: params.model || defaultModel,
};
if (params.pages) requestData.page_ranges = params.pages;
if (params.ocr !== undefined) requestData.is_ocr = params.ocr;
if (params.formula !== undefined) requestData.enable_formula = params.formula;
if (params.table !== undefined) requestData.enable_table = params.table;
if (params.language) requestData.language = params.language;
if (params.formats?.length) requestData.extra_formats = params.formats;
const result = await mineruRequest<TaskResponse>("/extract/task", "POST", requestData);
return {
content: [
{
type: "text",
text: `Task created: ${result.task_id}\nUse mineru_status to check progress.`,
},
],
};
}
);
// Tool 2: mineru_status
server.tool(
"mineru_status",
"Check task progress. Returns download URL when done.",
{
task_id: z.string().describe("Task ID from mineru_parse"),
format: z
.enum(["concise", "detailed"])
.optional()
.default("concise")
.describe("Output format"),
},
async (params) => {
const status = await mineruRequest<TaskStatus>(`/extract/task/${params.task_id}`);
const text =
params.format === "detailed"
? formatDetailedStatus(status)
: formatConciseStatus(status);
return {
content: [{ type: "text", text }],
};
}
);
// Tool 3: mineru_batch
server.tool(
"mineru_batch",
"Parse multiple URLs in one batch (max 200).",
{
urls: z.array(z.string()).describe("Array of document URLs"),
model: z
.enum(["pipeline", "vlm"])
.optional()
.describe("pipeline=fast, vlm=90% accuracy"),
ocr: z.boolean().optional().describe("Enable OCR (pipeline only)"),
formula: z.boolean().optional().describe("Formula recognition"),
table: z.boolean().optional().describe("Table recognition"),
language: z.string().optional().describe("Language code: ch, en, etc"),
formats: z
.array(z.enum(["docx", "html", "latex"]))
.optional()
.describe("Extra export formats"),
},
async (params) => {
if (params.urls.length > 200) {
throw new Error("Max 200 URLs per batch. Split into smaller batches.");
}
const requestData: Record<string, unknown> = {
files: params.urls.map((url) => ({ url })),
model_version: params.model || defaultModel,
};
if (params.ocr !== undefined) requestData.is_ocr = params.ocr;
if (params.formula !== undefined) requestData.enable_formula = params.formula;
if (params.table !== undefined) requestData.enable_table = params.table;
if (params.language) requestData.language = params.language;
if (params.formats?.length) requestData.extra_formats = params.formats;
const result = await mineruRequest<BatchResponse>("/extract/task/batch", "POST", requestData);
return {
content: [
{
type: "text",
text: `Batch created: ${result.batch_id}\n${params.urls.length} files queued.\nUse mineru_batch_status to check progress.`,
},
],
};
}
);
// Tool 4: mineru_batch_status
server.tool(
"mineru_batch_status",
"Get batch results. Supports pagination for large batches.",
{
batch_id: z.string().describe("Batch ID from mineru_batch"),
limit: z.number().optional().default(10).describe("Max results to return"),
offset: z.number().optional().default(0).describe("Skip first N results"),
format: z
.enum(["concise", "detailed"])
.optional()
.default("concise")
.describe("Output format"),
},
async (params) => {
const batch = await mineruRequest<BatchStatus>(
`/extract-results/batch/${params.batch_id}`
);
const text =
params.format === "detailed"
? JSON.stringify(batch, null, 2)
: formatConciseBatch(batch, params.limit ?? 10, params.offset ?? 0);
return {
content: [{ type: "text", text }],
};
}
);
// Tool 5: mineru_upload_batch
server.tool(
"mineru_upload_batch",
"Upload local files from a directory for batch parsing. Handles file upload to MinerU servers. Returns batch_id to track with mineru_batch_status. Use mineru_download_results to get named markdown files.",
{
directory: z.string().optional().describe("Directory path containing PDF/DOC/PPT files"),
files: z.array(z.string()).optional().describe("Array of absolute file paths (alternative to directory)"),
model: z
.enum(["pipeline", "vlm"])
.optional()
.describe("pipeline=fast, vlm=90% accuracy"),
formula: z.boolean().optional().describe("Formula recognition"),
table: z.boolean().optional().describe("Table recognition"),
language: z.string().optional().describe("Language code: ch, en, etc"),
formats: z
.array(z.enum(["docx", "html", "latex"]))
.optional()
.describe("Extra export formats"),
},
async (params) => {
const supportedExts = new Set([".pdf", ".doc", ".docx", ".ppt", ".pptx", ".png", ".jpg", ".jpeg"]);
// Collect files
let filePaths: string[] = [];
if (params.files?.length) {
filePaths = params.files;
} else if (params.directory) {
const dir = params.directory;
if (!existsSync(dir)) {
throw new Error(`Directory not found: ${dir}`);
}
const entries = readdirSync(dir);
filePaths = entries
.filter((f) => supportedExts.has(extname(f).toLowerCase()))
.map((f) => join(dir, f));
} else {
throw new Error("Provide either 'directory' or 'files' parameter.");
}
if (filePaths.length === 0) {
throw new Error("No supported files found.");
}
if (filePaths.length > 200) {
throw new Error(`Found ${filePaths.length} files. Max 200 per batch. Filter or split.`);
}
// Validate files exist and build request with collision-safe data_ids
const fileEntries: Array<{ name: string; data_id: string }> = [];
const usedDataIds = new Set<string>();
for (const fp of filePaths) {
if (!existsSync(fp)) {
throw new Error(`File not found: ${fp}`);
}
const stats = statSync(fp);
if (stats.size > 200 * 1024 * 1024) {
throw new Error(`File too large (${(stats.size / 1024 / 1024).toFixed(0)}MB): ${basename(fp)}. Max 200MB.`);
}
const name = basename(fp);
let stem = name.replace(extname(name), "").replace(/[^a-zA-Z0-9_\-\.]/g, "_").slice(0, 128);
// Handle data_id collisions
let candidate = stem;
let counter = 1;
while (usedDataIds.has(candidate)) {
candidate = `${stem}_${counter++}`;
}
usedDataIds.add(candidate);
fileEntries.push({ name, data_id: candidate });
}
// Request upload URLs
const requestData: Record<string, unknown> = {
files: fileEntries,
model_version: params.model || defaultModel,
};
if (params.formula !== undefined) requestData.enable_formula = params.formula;
if (params.table !== undefined) requestData.enable_table = params.table;
if (params.language) requestData.language = params.language;
if (params.formats?.length) requestData.extra_formats = params.formats;
const result = await mineruRequest<BatchFileUploadResponse>("/file-urls/batch", "POST", requestData);
if (result.file_urls.length !== filePaths.length) {
throw new Error(`Expected ${filePaths.length} upload URLs, got ${result.file_urls.length}`);
}
// Upload each file to presigned OSS URLs using native fetch
// Presigned URLs are signed WITHOUT Content-Type — axios force-adds it, so use fetch
const uploadResults: string[] = [];
for (let i = 0; i < filePaths.length; i++) {
const fp = filePaths[i];
const uploadUrl = result.file_urls[i];
const fileName = basename(fp);
try {
const fileData = readFileSync(fp);
const resp = await fetch(uploadUrl, {
method: "PUT",
body: fileData,
signal: AbortSignal.timeout(300_000),
});
if (!resp.ok) {
const body = await resp.text();
uploadResults.push(`FAIL: ${fileName} - HTTP ${resp.status}: ${body.slice(0, 200)}`);
} else {
uploadResults.push(`OK: ${fileName}`);
}
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
uploadResults.push(`FAIL: ${fileName} - ${msg}`);
}
}
const successCount = uploadResults.filter((r) => r.startsWith("OK")).length;
const failCount = uploadResults.filter((r) => r.startsWith("FAIL")).length;
let text = `Batch ${result.batch_id}: ${successCount} uploaded, ${failCount} failed.\n`;
text += `Parsing starts automatically. Use mineru_batch_status to track.\n`;
text += `Use mineru_download_results with this batch_id to get named .md files.\n`;
if (failCount > 0) {
text += `\nFailed uploads:\n${uploadResults.filter((r) => r.startsWith("FAIL")).join("\n")}`;
}
return {
content: [{ type: "text", text }],
};
}
);
// Tool 6: mineru_download_results
server.tool(
"mineru_download_results",
"Download batch results and extract markdown files with original filenames. Downloads zips, extracts .md content, and saves as {original_name}.md in output directory.",
{
batch_id: z.string().describe("Batch ID from mineru_upload_batch or mineru_batch"),
output_dir: z.string().describe("Directory to save markdown files"),
overwrite: z.boolean().optional().default(false).describe("Overwrite existing files"),
},
async (params) => {
// Check batch status
const batch = await mineruRequest<BatchStatus>(
`/extract-results/batch/${params.batch_id}`
);
const results = batch.extract_result;
const doneResults = results.filter((r) => r.state === "done" && r.full_zip_url);
const pendingResults = results.filter((r) => ["pending", "running", "converting"].includes(r.state));
const failedResults = results.filter((r) => r.state === "failed");
if (doneResults.length === 0 && pendingResults.length > 0) {
return {
content: [{
type: "text",
text: `Batch ${params.batch_id}: ${pendingResults.length} still processing, 0 done. Try again later.`,
}],
};
}
// Create output directory
mkdirSync(params.output_dir, { recursive: true });
const tmpBase = join(tmpdir(), `mineru-dl-${Date.now()}-${randomBytes(4).toString("hex")}`);
mkdirSync(tmpBase, { recursive: true });
const downloaded: string[] = [];
const errors: string[] = [];
// Depth-limited, symlink-safe .md file finder
const findMd = (dir: string, baseDir: string, depth = 0, maxDepth = 5): string | null => {
if (depth > maxDepth) return null;
const entries = readdirSync(dir, { withFileTypes: true });
for (const entry of entries) {
if (entry.isSymbolicLink()) continue; // skip symlinks (zip slip protection)
const fullPath = join(dir, entry.name);
if (entry.isFile() && entry.name.endsWith(".md")) {
// Verify resolved path stays within extraction dir
const realPath = realpathSync(fullPath);
if (!realPath.startsWith(realpathSync(baseDir))) continue;
return fullPath;
}
if (entry.isDirectory()) {
const found = findMd(fullPath, baseDir, depth + 1, maxDepth);
if (found) return found;
}
}
return null;
};
for (const r of doneResults) {
// Prefer data_id (set by us from original filename) over file_name (API-returned, can be stale)
const rawName = r.data_id || r.file_name || "unknown";
const safeName = basename(rawName).replace(/[^a-zA-Z0-9_\-\.\s]/g, "_");
const stem = safeName.replace(extname(safeName), "") || "unnamed";
const mdOutputPath = join(params.output_dir, `${stem}.md`);
if (!params.overwrite && existsSync(mdOutputPath)) {
downloaded.push(`SKIP: ${stem}.md (exists)`);
continue;
}
try {
// Download zip via streaming to avoid memory pressure
const zipPath = join(tmpBase, `${stem}.zip`);
const response = await axios.get(r.full_zip_url!, {
responseType: "stream",
timeout: 120_000,
});
await pipeline(response.data, createWriteStream(zipPath));
// Extract zip using execFileSync (no shell injection)
const extractDir = join(tmpBase, stem);
mkdirSync(extractDir, { recursive: true });
try {
execFileSync("unzip", ["-o", "-q", zipPath, "-d", extractDir], {
timeout: 60_000,
});
} catch (unzipErr) {
const msg = unzipErr instanceof Error ? unzipErr.message : String(unzipErr);
errors.push(`UNZIP_FAIL: ${safeName} - ${msg}`);
continue;
}
const mdFile = findMd(extractDir, extractDir);
if (mdFile) {
const mdContent = readFileSync(mdFile, "utf-8");
writeFileSync(mdOutputPath, mdContent, "utf-8");
downloaded.push(`OK: ${stem}.md`);
} else {
errors.push(`NO_MD: ${safeName} - no .md file found in zip`);
}
// Cleanup zip
unlinkSync(zipPath);
} catch (err) {
const msg = err instanceof Error ? err.message : String(err);
errors.push(`FAIL: ${safeName} - ${msg}`);
}
}
// Cleanup temp directory
try {
rmSync(tmpBase, { recursive: true, force: true });
} catch { /* ignore cleanup errors */ }
let text = `Downloaded to: ${params.output_dir}\n`;
text += `Done: ${downloaded.filter((d) => d.startsWith("OK")).length}`;
text += ` | Skipped: ${downloaded.filter((d) => d.startsWith("SKIP")).length}`;
text += ` | Failed: ${errors.length}`;
if (pendingResults.length > 0) {
text += ` | Still processing: ${pendingResults.length}`;
}
if (failedResults.length > 0) {
text += ` | Parse failed: ${failedResults.length}`;
}
text += `\n\nFiles:\n${downloaded.join("\n")}`;
if (errors.length > 0) {
text += `\n\nErrors:\n${errors.join("\n")}`;
}
if (pendingResults.length > 0) {
text += `\n\nRe-run this tool to download remaining files once processing completes.`;
}
return {
content: [{ type: "text", text }],
};
}
);
return server.server;
}
// Sandbox server for Smithery scanning (no real credentials needed)
export function createSandboxServer() {
return createServer({
config: {
mineruApiKey: "sandbox-key",
mineruBaseUrl: "https://mineru.net/api/v4",
mineruDefaultModel: "pipeline",
},
});
}
// STDIO mode (npx, local dev, Claude Code)
async function main() {
const config: Config = {
mineruApiKey: process.env.MINERU_API_KEY || "",
mineruBaseUrl: process.env.MINERU_BASE_URL || "https://mineru.net/api/v4",
mineruDefaultModel: (process.env.MINERU_DEFAULT_MODEL as "pipeline" | "vlm") || "pipeline",
};
const server = createServer({ config });
const transport = new StdioServerTransport();
await server.connect(transport);
console.error("MinerU MCP server running (stdio mode)");
}
// Only run stdio when executed directly (not when imported by Smithery CLI)
const isDirectRun = process.argv[1] && (
process.argv[1].endsWith('index.js') ||
process.argv[1].endsWith('index.ts')
);
if (isDirectRun) {
main().catch((error) => {
console.error("Fatal error:", error);
process.exit(1);
});
}