PromptX

pdf-reader.tool.js•16.5 KiB

/** * PDF Reader - 纯 Node.js 实现的 PDF 分页阅读工具（带智能缓存） * * 战略意义： * 1. 按需阅读：像人类读书一样，指定页码按需读取 * 2. 智能缓存：已解析的页面不重复处理，节省算力 * 3. 性能优化：避免大 PDF 一次性加载导致宕机 * 4. Token 优化：只返回需要的页面内容，不浪费上下文 * * 设计理念： * - 不传 pages：只返回 PDF 元信息（总页数、标题等） * - 传 pages：提取指定页的文本和图片 * - 图片存储在 toolbox 目录，统一管理 * - 使用 storage 缓存解析状态，避免重复解析 * * 生态定位： * 为 AI 提供高效的 PDF 分页阅读能力 * 与 Claude Code 的 Read 工具无缝配合 */ module.exports = { getDependencies() { return { 'pdf-parse': '^2.1.10' }; }, getMetadata() { return { id: 'pdf-reader', name: 'PDF Reader', description: 'PDF 分页阅读工具，支持按页码提取文本和图片，智能缓存避免重复解析', version: '2.1.0', author: '鲁班' }; }, getSchema() { return { parameters: { type: 'object', properties: { pdfPath: { type: 'string', description: 'PDF 文件的绝对路径', minLength: 1 }, pages: { oneOf: [ { type: 'number', minimum: 1 }, { type: 'array', items: { type: 'number', minimum: 1 } } ], description: '要读取的页码（可选）。不传则只返回 PDF 元信息；传数字读取单页；传数组读取多页' }, extractImages: { type: 'boolean', description: '是否提取图片（默认 true）', default: true }, forceRefresh: { type: 'boolean', description: '强制重新解析，忽略缓存（默认 false）', default: false } }, required: ['pdfPath'] } }; }, getBridges() { return { 'pdf:parse': { real: async (args, api) => { api.logger.info('[Bridge] Parsing PDF with pdf-parse'); const { PDFParse } = await api.importx('pdf-parse'); const parser = new PDFParse({ data: args.buffer }); return parser; }, mock: async (args, api) => { api.logger.debug('[Mock] Creating mock PDF parser'); return { getText: async () => ({ text: 'Mock PDF full content\nPage 1 content here\nPage 2 content here\nPage 3 content here', total: 3, info: { Title: 'Mock PDF Document', Author: 'Test Author' }, metadata: null, version: '1.0' }), getImage: async () => ({ pages: [ { pageNumber: 1, images: [ { fileName: 'img-0', data: Buffer.from('fake-image-data-page-1'), width: 800, height: 600 } ] }, { pageNumber: 2, images: [ { fileName: 'img-0', data: Buffer.from('fake-image-data-page-2'), width: 1024, height: 768 } ] }, { pageNumber: 3, images: [] } ] }) }; } }, 
'fs:readFile': { real: async (args, api) => { api.logger.info(`[Bridge] Reading file: ${args.path}`); const fs = await api.importx('fs'); return await fs.promises.readFile(args.path); }, mock: async (args, api) => { api.logger.debug(`[Mock] Mock reading file: ${args.path}`); return Buffer.from('mock-pdf-binary-data'); } }, 'fs:writeFile': { real: async (args, api) => { api.logger.info(`[Bridge] Writing file: ${args.path}`); const fs = await api.importx('fs'); await fs.promises.writeFile(args.path, args.data); }, mock: async (args, api) => { api.logger.debug(`[Mock] Mock writing file: ${args.path}`); } }, 'fs:mkdir': { real: async (args, api) => { api.logger.info(`[Bridge] Creating directory: ${args.path}`); const fs = await api.importx('fs'); await fs.promises.mkdir(args.path, { recursive: true }); }, mock: async (args, api) => { api.logger.debug(`[Mock] Mock creating directory: ${args.path}`); } }, 'fs:exists': { real: async (args, api) => { const fs = await api.importx('fs'); try { await fs.promises.access(args.path); return true; } catch { return false; } }, mock: async (args, api) => { api.logger.debug(`[Mock] Checking file exists: ${args.path}`); return false; // Mock 总是返回不存在，这样会走解析流程 } }, 'crypto:hash': { real: async (args, api) => { const crypto = await api.importx('crypto'); const hash = crypto.createHash('md5'); hash.update(args.data); return hash.digest('hex'); }, mock: async (args, api) => { api.logger.debug('[Mock] Generating mock hash'); return 'mock-hash-' + Date.now(); } }, 'path:join': { real: async (args, api) => { const path = await api.importx('path'); return path.join(...args.parts); }, mock: async (args, api) => { return args.parts.join('/'); } } }; }, getBridgeErrors() { return { 'pdf:parse': [ { code: 'INVALID_PDF', match: /Invalid PDF/i, solution: '检查 PDF 文件是否损坏', retryable: false } ], 'fs:readFile': [ { code: 'FILE_NOT_FOUND', match: /ENOENT/, solution: '检查 PDF 文件路径是否正确', retryable: false } ] }; }, async execute(params) { const { api } = 
this; const { pdfPath, pages, extractImages = true, forceRefresh = false } = params; try { api.logger.info('开始处理 PDF', { pdfPath, pages, extractImages, forceRefresh }); // 1. 读取 PDF 文件 const pdfBuffer = await api.bridge.execute('fs:readFile', { path: pdfPath }); api.logger.info('PDF 文件读取成功', { size: pdfBuffer.length }); // 2. 生成 PDF hash const pdfHash = await api.bridge.execute('crypto:hash', { data: pdfBuffer }); // 3. 获取工具信息，构建 PDF 数据目录 const toolInfo = api.getInfo(); const pdfDir = await api.bridge.execute('path:join', { parts: [toolInfo.sandboxPath, 'pdfs', pdfHash] }); api.logger.info('PDF 数据目录', { pdfDir }); // 4. 检查 storage 中的 PDF 元信息缓存 const allPdfs = api.storage.getItem('pdfs') || {}; let pdfMetadata = allPdfs[pdfHash]; api.logger.info('Storage 读取结果', { hasPdfs: !!allPdfs, pdfKeys: Object.keys(allPdfs), hasPdfMetadata: !!pdfMetadata, cachedPages: pdfMetadata ? Object.keys(pdfMetadata.parsedPages || {}) : [] }); if (!pdfMetadata || forceRefresh) { // 首次处理此 PDF，解析基本信息 api.logger.info('首次处理此 PDF，解析元信息'); const parser = await api.bridge.execute('pdf:parse', { buffer: pdfBuffer }); const basicInfo = await parser.getText(); const totalPages = basicInfo.total; pdfMetadata = { totalPages: totalPages, title: basicInfo.info?.Title || null, author: basicInfo.info?.Author || null, pdfHash: pdfHash, pdfPath: pdfPath, createdAt: Date.now(), parsedPages: {} }; allPdfs[pdfHash] = pdfMetadata; api.storage.setItem('pdfs', allPdfs); api.logger.info('PDF 元信息已缓存', { totalPages }); } else { api.logger.info('使用缓存的 PDF 元信息', { totalPages: pdfMetadata.totalPages, cachedPages: Object.keys(pdfMetadata.parsedPages || {}).length }); } const totalPages = pdfMetadata.totalPages; // 5. 
如果没有传 pages，只返回元信息 if (pages === undefined || pages === null) { return { success: true, metadata: { totalPages: pdfMetadata.totalPages, title: pdfMetadata.title, author: pdfMetadata.author, cachedPages: Object.keys(pdfMetadata.parsedPages || {}).length }, message: '使用 pages 参数指定要读取的页码，例如：pages: 1 或 pages: [1,2,3]' }; } // 6. 标准化 pages 为数组 const pageNumbers = Array.isArray(pages) ? pages : [pages]; // 验证页码范围 for (const pageNum of pageNumbers) { if (pageNum < 1 || pageNum > totalPages) { throw new Error(`页码 ${pageNum} 超出范围（总页数：${totalPages}）`); } } api.logger.info('准备读取指定页面', { pageNumbers }); // 7. 创建 PDF 数据目录 await api.bridge.execute('fs:mkdir', { path: pdfDir }); // 8. 提取指定页面的内容（使用缓存） const pagesContent = []; const allImages = []; let fullText = ''; let cacheHits = 0; let cacheMisses = 0; // 只在有 cache miss 时才解析整个 PDF let imageResult = null; let textLines = null; for (const pageNum of pageNumbers) { api.logger.info(`处理第 ${pageNum} 页`); // 检查缓存 const pageCache = pdfMetadata.parsedPages?.[pageNum]; const shouldUseCache = pageCache && !forceRefresh; if (shouldUseCache && extractImages) { // 验证缓存的图片文件是否存在 api.logger.info(`检查第 ${pageNum} 页缓存`, { hasPageCache: !!pageCache, imageCount: pageCache?.imageCount }); let allImagesExist = true; for (let i = 0; i < (pageCache.imageCount || 0); i++) { const imgPath = await api.bridge.execute('path:join', { parts: [pdfDir, `page-${pageNum}-img-${i}.png`] }); const exists = await api.bridge.execute('fs:exists', { path: imgPath }); api.logger.info(`图片文件检查`, { imgPath, exists }); if (!exists) { allImagesExist = false; api.logger.warn(`缓存的图片不存在: ${imgPath}`); break; } } if (allImagesExist) { // 使用缓存 cacheHits++; api.logger.info(`第 ${pageNum} 页使用缓存`, { imageCount: pageCache.imageCount }); const pageImages = []; for (let i = 0; i < (pageCache.imageCount || 0); i++) { const imgPath = await api.bridge.execute('path:join', { parts: [pdfDir, `page-${pageNum}-img-${i}.png`] }); pageImages.push({ page: pageNum, index: i, path: imgPath, width: 
null, height: null }); allImages.push(pageImages[pageImages.length - 1]); } pagesContent.push({ page: pageNum, hasImages: pageImages.length > 0, imageCount: pageImages.length, images: pageImages, fromCache: true }); // 文本部分需要重新提取（因为文本没有缓存文件） if (!textLines) { const parser = await api.bridge.execute('pdf:parse', { buffer: pdfBuffer }); const basicInfo = await parser.getText(); textLines = basicInfo.text.split('\n'); } const estimatedLinesPerPage = Math.ceil(textLines.length / totalPages); const startLine = (pageNum - 1) * estimatedLinesPerPage; const endLine = pageNum * estimatedLinesPerPage; const pageText = textLines.slice(startLine, endLine).join('\n'); fullText += `\n=== 第 ${pageNum} 页 ===\n${pageText}\n`; continue; } } // 缓存未命中，需要解析 cacheMisses++; api.logger.info(`第 ${pageNum} 页缓存未命中，开始解析`); // 延迟解析：只在第一次 cache miss 时解析整个 PDF if (!imageResult) { const parser = await api.bridge.execute('pdf:parse', { buffer: pdfBuffer }); imageResult = await parser.getImage(); const basicInfo = await parser.getText(); textLines = basicInfo.text.split('\n'); api.logger.info('PDF 解析完成'); } // 从 imageResult 中找到对应页面 const pageData = imageResult.pages.find(p => p.pageNumber === pageNum); if (!pageData) { api.logger.warn(`第 ${pageNum} 页没有找到数据`); continue; } const pageImages = []; // 处理该页的图片 if (extractImages && pageData.images && pageData.images.length > 0) { for (let i = 0; i < pageData.images.length; i++) { const img = pageData.images[i]; const imgFileName = `page-${pageNum}-img-${i}.png`; const imgPath = await api.bridge.execute('path:join', { parts: [pdfDir, imgFileName] }); // 保存图片 await api.bridge.execute('fs:writeFile', { path: imgPath, data: img.data }); const imageInfo = { page: pageNum, index: i, path: imgPath, width: img.width || null, height: img.height || null }; pageImages.push(imageInfo); allImages.push(imageInfo); } api.logger.info(`第 ${pageNum} 页图片处理完成`, { imageCount: pageImages.length }); } // 缓存页面信息 // 重新从 storage 读取以确保数据最新 const currentPdfs = 
api.storage.getItem('pdfs') || {}; const currentMetadata = currentPdfs[pdfHash] || pdfMetadata; if (!currentMetadata.parsedPages) { currentMetadata.parsedPages = {}; } currentMetadata.parsedPages[pageNum] = { timestamp: Date.now(), imageCount: pageImages.length, hasText: true }; currentPdfs[pdfHash] = currentMetadata; api.storage.setItem('pdfs', currentPdfs); // 更新本地引用 pdfMetadata = currentMetadata; pagesContent.push({ page: pageNum, hasImages: pageImages.length > 0, imageCount: pageImages.length, images: pageImages, fromCache: false }); // 提取文本 const estimatedLinesPerPage = Math.ceil(textLines.length / totalPages); const startLine = (pageNum - 1) * estimatedLinesPerPage; const endLine = pageNum * estimatedLinesPerPage; const pageText = textLines.slice(startLine, endLine).join('\n'); fullText += `\n=== 第 ${pageNum} 页 ===\n${pageText}\n`; } // 9. 构建返回结果 const result = { success: true, metadata: { totalPages: totalPages, title: pdfMetadata.title, author: pdfMetadata.author, requestedPages: pageNumbers }, content: { text: fullText.trim(), pages: pagesContent, hasImages: allImages.length > 0, imageCount: allImages.length, imagesDirectory: pdfDir }, cache: { hits: cacheHits, misses: cacheMisses, totalCachedPages: Object.keys(pdfMetadata.parsedPages || {}).length } }; api.logger.info('PDF 分页读取完成', { readPages: pageNumbers.length, totalImages: allImages.length, cacheHits, cacheMisses }); return result; } catch (error) { api.logger.error('PDF 处理失败', error); throw error; } } };

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Deepractice/PromptX'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

pdf-reader.tool.js•16.5 KiB