#!/usr/bin/env node
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import {
CallToolRequestSchema,
ListToolsRequestSchema,
ReadResourceRequestSchema,
ListResourcesRequestSchema,
} from "@modelcontextprotocol/sdk/types.js";
import fs from 'fs-extra';
import path from 'path';
import crypto from 'crypto';
import mammoth from 'mammoth';
import Tesseract from 'tesseract.js';
import NodeCache from 'node-cache';
import sharp from 'sharp';
import { Worker, isMainThread, parentPort, workerData } from 'worker_threads';
import { fileURLToPath } from 'url';
// ES modules do not define __filename/__dirname; derive them from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// MCP server instance; declares the tools and resources capabilities.
const server = new Server(
{
name: "word-doc-reader",
version: "2.0.0",
},
{
capabilities: {
tools: {},
resources: {},
},
}
);
// Cache configuration
const documentCache = new NodeCache({
stdTTL: 3600, // keep entries for 1 hour
checkperiod: 600, // check for expired entries every 10 minutes
useClones: false
});
// Full-text index
/**
 * In-memory inverted index over document text.
 *
 * Tokens are: lower-cased runs of ASCII letters, runs of digits, and every
 * 2-, 3- and 4-character window inside each run of CJK ideographs.
 */
class DocumentIndexer {
  constructor() {
    this.index = new Map(); // token -> Set of documentIds
    this.documents = new Map(); // documentId -> document metadata
    this.lastUpdated = Date.now();
  }

  /**
   * Adds a document to the index.
   *
   * @param {string} documentId - Identifier stored in the postings sets.
   * @param {string} content - Text to tokenize and index.
   * @param {object} metadata - Stored alongside `wordCount` and `lastIndexed`.
   */
  addDocument(documentId, content, metadata) {
    const words = this.extractWords(content);
    // Update the inverted index.
    for (const word of words) {
      let postings = this.index.get(word);
      if (!postings) {
        postings = new Set();
        this.index.set(word, postings);
      }
      postings.add(documentId);
    }
    // Store the document metadata.
    const now = Date.now();
    this.documents.set(documentId, {
      ...metadata,
      wordCount: words.length,
      lastIndexed: now
    });
    this.lastUpdated = now;
  }

  /**
   * Tokenizes text into index terms (CJK n-grams first, then English words,
   * then numbers).
   *
   * @param {string} text - Text to tokenize; null/undefined yields no tokens.
   * @returns {string[]} Tokens, duplicates included.
   */
  extractWords(text) {
    const source = String(text ?? '');
    const words = [];
    // Naive CJK segmentation: emit every 2-, 3- and 4-character window.
    for (const run of source.match(/[\u4e00-\u9fff]+/g) ?? []) {
      for (let start = 0; start < run.length; start++) {
        for (let len = 2; len <= 4 && start + len <= run.length; len++) {
          words.push(run.slice(start, start + len));
        }
      }
    }
    // Tokens are appended one by one: spreading a very large match array into
    // push() can exceed the engine's argument limit on big documents.
    for (const word of source.match(/[a-zA-Z]+/g) ?? []) {
      words.push(word.toLowerCase());
    }
    for (const digits of source.match(/\d+/g) ?? []) {
      words.push(digits);
    }
    return words;
  }

  /**
   * Finds documents containing any token of the query.
   *
   * @param {string} query - Free-text query, tokenized like document content.
   * @returns {{documentId: string, score: number, document: object}[]}
   *   Matches sorted by descending score; the score is the number of query
   *   tokens (duplicates counted) that occur in the document.
   */
  search(query) {
    const documentScores = new Map();
    for (const word of this.extractWords(query)) {
      const docs = this.index.get(word);
      if (!docs) continue;
      for (const docId of docs) {
        documentScores.set(docId, (documentScores.get(docId) ?? 0) + 1);
      }
    }
    // Sort by relevance.
    return [...documentScores.entries()]
      .sort((a, b) => b[1] - a[1])
      .map(([docId, score]) => ({
        documentId: docId,
        score,
        document: this.documents.get(docId)
      }));
  }

  /** Removes every token and document from the index. */
  clear() {
    this.index.clear();
    this.documents.clear();
    this.lastUpdated = Date.now();
  }

  /**
   * @returns {{totalWords: number, totalDocuments: number, lastUpdated: number}}
   *   Distinct token count, document count and last modification timestamp.
   */
  getStats() {
    return {
      totalWords: this.index.size,
      totalDocuments: this.documents.size,
      lastUpdated: this.lastUpdated
    };
  }
}
const documentIndexer = new DocumentIndexer();
// Large-document processor
/**
 * Splits a large file into byte chunks, processes each chunk in a worker
 * thread, and merges the per-chunk results.
 *
 * NOTE(review): a .docx file is a ZIP archive, so individual byte slices are
 * presumably not parseable on their own by the worker — confirm that the
 * chunked path produces usable text.
 */
class LargeDocumentProcessor {
  constructor() {
    this.maxFileSize = 10 * 1024 * 1024; // 10MB
    this.maxPages = 100;
  }

  /**
   * @param {string} filePath - File to inspect.
   * @returns {Promise<boolean>} True when the file is larger than `maxFileSize`.
   */
  async isLargeDocument(filePath) {
    const stats = await fs.stat(filePath);
    return stats.size > this.maxFileSize;
  }

  /**
   * Reads the file, cuts it into chunks and processes them in parallel.
   *
   * @param {string} filePath - File to process.
   * @param {object} [options] - Forwarded to each worker; `chunkSize` (bytes)
   *   defaults to 1 MiB when missing or not a number >= 1.
   * @returns {Promise<object>} Merged result, see `mergeResults`.
   */
  async processInChunks(filePath, options = {}) {
    const requestedSize = Number(options.chunkSize);
    // A zero, negative or NaN step would never advance the loop below.
    const chunkSize = Number.isFinite(requestedSize) && requestedSize >= 1
      ? Math.floor(requestedSize)
      : 1024 * 1024; // 1MB chunks
    const buffer = await fs.readFile(filePath);
    const chunks = [];
    for (let offset = 0; offset < buffer.length; offset += chunkSize) {
      chunks.push(buffer.subarray(offset, offset + chunkSize));
    }
    // Process the chunks in parallel; Promise.all keeps them in chunk order.
    const results = await Promise.all(
      chunks.map((chunk, index) => this.processChunk(chunk, index, options))
    );
    return this.mergeResults(results);
  }

  /**
   * Processes one chunk in a dedicated worker thread started from this file.
   *
   * @param {Buffer} chunk - Raw bytes of the chunk.
   * @param {number} index - Position of the chunk in the file.
   * @param {object} options - Forwarded to the worker unchanged.
   * @returns {Promise<object>} The first message posted by the worker.
   *   Rejects on a worker error or when the worker exits without a message.
   */
  async processChunk(chunk, index, options) {
    return new Promise((resolve, reject) => {
      const worker = new Worker(__filename, {
        workerData: {
          type: 'process_chunk',
          chunk: chunk.toString('base64'),
          index,
          options
        }
      });
      worker.once('message', (result) => {
        resolve(result);
        // The worker has delivered its only result; release the thread.
        void worker.terminate();
      });
      worker.once('error', reject);
      // Without this, a worker that exits silently leaves the promise pending
      // forever. It is a no-op when the promise has already settled.
      worker.once('exit', (code) => {
        reject(new Error(`Worker for chunk ${index} exited with code ${code} before returning a result`));
      });
    });
  }

  /**
   * Concatenates per-chunk results.
   *
   * @param {Array<object|null|undefined>} results - Worker results in chunk order.
   * @returns {{text: string, tables: Array, images: Array, metadata: object}}
   *   `metadata.failedChunks` lists the indexes of chunks that produced no
   *   result or reported `success: false`.
   */
  mergeResults(results) {
    const merged = {
      text: '',
      tables: [],
      images: [],
      metadata: {
        chunksProcessed: results.length,
        totalLength: 0,
        failedChunks: []
      }
    };
    results.forEach((result, position) => {
      if (result == null || result.success === false) {
        merged.metadata.failedChunks.push(result?.index ?? position);
      }
      if (result == null) return;
      const text = result.text || '';
      merged.text += text;
      if (result.tables) merged.tables.push(...result.tables);
      if (result.images) merged.images.push(...result.images);
      merged.metadata.totalLength += text.length;
    });
    return merged;
  }
}
const largeDocProcessor = new LargeDocumentProcessor();
// Document analyzer (table and image extraction)
const NAMED_HTML_ENTITIES = {
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  nbsp: '\u00a0'
};
const HTML_ENTITY_PATTERN = /&(?:#(\d+)|#x([0-9a-fA-F]+)|(amp|lt|gt|quot|apos|nbsp));/g;

/**
 * Extracts tables (via mammoth's HTML output) and embedded images (via the
 * .docx ZIP container) from a Word document, running OCR on each image.
 */
class DocumentAnalyzer {
  constructor() {
    this.ocrWorker = null;
    this.ocrInitPromise = null;
  }

  /**
   * Lazily creates the shared OCR worker for Simplified Chinese + English.
   * Concurrent callers share a single initialization, and the worker is only
   * published once it is fully initialized, so a failed init can be retried.
   *
   * NOTE(review): loadLanguage()/initialize() are assumed to exist in the
   * installed tesseract.js version — confirm against the project's lockfile.
   */
  async initializeOCR() {
    if (this.ocrWorker) return;
    if (!this.ocrInitPromise) {
      this.ocrInitPromise = (async () => {
        const worker = await Tesseract.createWorker();
        try {
          await worker.loadLanguage('chi_sim+eng');
          await worker.initialize('chi_sim+eng');
        } catch (error) {
          // Best effort: do not leak the half-initialized worker.
          await worker.terminate().catch(() => {});
          throw error;
        }
        this.ocrWorker = worker;
      })().finally(() => {
        this.ocrInitPromise = null;
      });
    }
    await this.ocrInitPromise;
  }

  /**
   * Converts the document to HTML and parses every <table> element.
   *
   * @param {Buffer} docxBuffer - Raw .docx bytes.
   * @returns {Promise<{rows: string[][], html: string}[]>} Tables with at
   *   least one row; an empty array when conversion fails (error is logged).
   */
  async extractTables(docxBuffer) {
    try {
      const result = await mammoth.convertToHtml({ buffer: docxBuffer });
      const html = result.value;
      // Simple regex-based table parsing.
      const tables = [];
      const tableRegex = /<table\b[^>]*>([\s\S]*?)<\/table>/gi;
      let match;
      while ((match = tableRegex.exec(html)) !== null) {
        const tableData = this.parseTableHtml(match[0]);
        if (tableData.rows.length > 0) {
          tables.push(tableData);
        }
      }
      return tables;
    } catch (error) {
      console.error('表格提取错误:', error);
      return [];
    }
  }

  /**
   * Decodes numeric and the common named HTML entities in a single pass, so
   * already-decoded output is never decoded a second time.
   *
   * @param {string} text - Text that may contain entities.
   * @returns {string} Decoded text; unknown entities are left untouched.
   */
  decodeHtmlEntities(text) {
    return text.replace(HTML_ENTITY_PATTERN, (entity, decimal, hex, named) => {
      if (named) return NAMED_HTML_ENTITIES[named];
      const codePoint = decimal ? Number.parseInt(decimal, 10) : Number.parseInt(hex, 16);
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    });
  }

  /**
   * Parses one table's HTML into rows of plain-text cells.
   *
   * @param {string} tableHtml - HTML containing <tr>/<td>/<th> elements.
   * @returns {{rows: string[][], html: string}} Non-empty rows plus the
   *   original HTML.
   */
  parseTableHtml(tableHtml) {
    const rows = [];
    const rowRegex = /<tr\b[^>]*>([\s\S]*?)<\/tr>/gi;
    let rowMatch;
    while ((rowMatch = rowRegex.exec(tableHtml)) !== null) {
      const cells = [];
      const cellRegex = /<t[dh]\b[^>]*>([\s\S]*?)<\/t[dh]>/gi;
      let cellMatch;
      while ((cellMatch = cellRegex.exec(rowMatch[1])) !== null) {
        // Strip tags first, then decode entities, so that escaped markup such
        // as "&lt;b&gt;" survives as literal text; finally collapse whitespace.
        const withoutTags = cellMatch[1].replace(/<[^>]*>/g, '');
        const cellText = this.decodeHtmlEntities(withoutTags)
          .replace(/\s+/g, ' ')
          .trim();
        cells.push(cellText);
      }
      if (cells.length > 0) {
        rows.push(cells);
      }
    }
    return { rows, html: tableHtml };
  }

  /**
   * Saves every image under `word/media/` to `<outputDir>/temp_images` and
   * runs OCR on it.
   *
   * @param {Buffer} docxBuffer - Raw .docx bytes (a ZIP archive).
   * @param {string} outputDir - Directory that receives `temp_images/`.
   * @returns {Promise<object[]>} One record per image; an empty array when
   *   extraction fails (error is logged).
   */
  async extractImages(docxBuffer, outputDir) {
    try {
      // Create the temporary directory.
      const tempDir = path.join(outputDir, 'temp_images');
      await fs.ensureDir(tempDir);
      // A dynamic import yields a module namespace object; the JSZip
      // constructor (which carries loadAsync) is its default export.
      const jszipModule = await import('jszip');
      const JSZip = jszipModule.default ?? jszipModule;
      const zip = await JSZip.loadAsync(docxBuffer);
      const images = [];
      for (const [filename, file] of Object.entries(zip.files)) {
        if (file.dir || !filename.startsWith('word/media/') || !this.isImageFile(filename)) {
          continue;
        }
        // Save the image.
        const imagePath = path.join(tempDir, path.basename(filename));
        const imageBuffer = await file.async('nodebuffer');
        await fs.writeFile(imagePath, imageBuffer);
        // Run OCR on it.
        const ocrText = await this.performOCR(imageBuffer);
        images.push({
          filename,
          path: imagePath,
          size: imageBuffer.length,
          format: path.extname(filename).substring(1),
          ocrText,
          extractedAt: new Date().toISOString()
        });
      }
      return images;
    } catch (error) {
      console.error('图片提取错误:', error);
      return [];
    }
  }

  /**
   * @param {string} filename - File name or path.
   * @returns {boolean} True for a supported raster image extension
   *   (case-insensitive).
   */
  isImageFile(filename) {
    const imageExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'];
    const ext = path.extname(filename).toLowerCase();
    return imageExtensions.includes(ext);
  }

  /**
   * Preprocesses an image and recognizes its text.
   *
   * @param {Buffer} imageBuffer - Encoded image bytes.
   * @returns {Promise<string>} Trimmed recognized text; an empty string when
   *   preprocessing or OCR fails (error is logged).
   */
  async performOCR(imageBuffer) {
    try {
      await this.initializeOCR();
      // Image preprocessing: cap the width at 2000px, sharpen, normalise
      // and re-encode as PNG.
      const processedImage = await sharp(imageBuffer)
        .resize({ width: 2000, withoutEnlargement: true })
        .sharpen()
        .normalise()
        .png()
        .toBuffer();
      const { data: { text } } = await this.ocrWorker.recognize(processedImage);
      return text.trim();
    } catch (error) {
      console.error('OCR处理错误:', error);
      return '';
    }
  }

  /** Terminates the OCR worker, if one was created. */
  async cleanup() {
    if (this.ocrWorker) {
      await this.ocrWorker.terminate();
      this.ocrWorker = null;
    }
  }
}
const documentAnalyzer = new DocumentAnalyzer();
// Cache manager
/**
 * On-disk cache of analysis results, stored as one JSON file per document in
 * `<module dir>/.cache` plus a `metadata.json` index. The cache is best
 * effort: unreadable cache files are logged and treated as a miss.
 */
class CacheManager {
  constructor() {
    this.cacheDir = path.join(__dirname, '.cache');
    this.metadataFile = path.join(this.cacheDir, 'metadata.json');
  }

  /** Creates the cache directory and an empty metadata file when missing. */
  async initialize() {
    await fs.ensureDir(this.cacheDir);
    if (!(await fs.pathExists(this.metadataFile))) {
      await fs.writeJson(this.metadataFile, {});
    }
  }

  /**
   * Derives the cache key from the path, modification time and size, so a
   * changed file gets a new key.
   *
   * @param {string} filePath - Document path; must exist (statSync throws otherwise).
   * @returns {string} Hex MD5 digest (used as a file name, not for security).
   */
  getCacheKey(filePath) {
    const stats = fs.statSync(filePath);
    const keyData = `${filePath}-${stats.mtime.getTime()}-${stats.size}`;
    return crypto.createHash('md5').update(keyData).digest('hex');
  }

  /**
   * Reads the metadata index.
   *
   * @returns {Promise<object>} The index, or an empty object when the file is
   *   unreadable or does not contain a JSON object.
   */
  async readMetadata() {
    try {
      const metadata = await fs.readJson(this.metadataFile);
      if (metadata !== null && typeof metadata === 'object' && !Array.isArray(metadata)) {
        return metadata;
      }
    } catch (error) {
      console.error('缓存元数据读取错误:', error);
    }
    return {};
  }

  /**
   * @param {string} filePath - Document path.
   * @returns {Promise<object|null>} Cached analysis result, or null on a miss.
   */
  async get(filePath) {
    await this.initialize();
    const cacheKey = this.getCacheKey(filePath);
    const cacheFile = path.join(this.cacheDir, `${cacheKey}.json`);
    const metadata = await this.readMetadata();
    if (!metadata[cacheKey] || !(await fs.pathExists(cacheFile))) {
      return null;
    }
    try {
      return await fs.readJson(cacheFile);
    } catch (error) {
      // A truncated or corrupt cache file must not break document reading.
      console.error('缓存文件读取错误:', error);
      return null;
    }
  }

  /**
   * Stores an analysis result and records it in the metadata index.
   *
   * @param {string} filePath - Document path.
   * @param {object} data - JSON-serializable analysis result.
   */
  async set(filePath, data) {
    await this.initialize();
    const cacheKey = this.getCacheKey(filePath);
    const cacheFile = path.join(this.cacheDir, `${cacheKey}.json`);
    const metadata = await this.readMetadata();
    // Serialize once and reuse the string for both the file and its size.
    const serialized = JSON.stringify(data);
    await fs.writeFile(cacheFile, serialized);
    metadata[cacheKey] = {
      filePath,
      cachedAt: new Date().toISOString(),
      size: Buffer.byteLength(serialized) // bytes on disk, not UTF-16 units
    };
    await fs.writeJson(this.metadataFile, metadata);
  }

  /** Deletes every cached file and resets the metadata index. */
  async clear() {
    await this.initialize();
    const files = await fs.readdir(this.cacheDir);
    for (const file of files) {
      if (file !== 'metadata.json') {
        await fs.remove(path.join(this.cacheDir, file));
      }
    }
    await fs.writeJson(this.metadataFile, {});
  }

  /**
   * @returns {Promise<{totalCached: number, totalSize: number, files: object}>}
   *   Entry count, summed recorded sizes and the raw metadata index.
   */
  async getStats() {
    await this.initialize();
    const metadata = await this.readMetadata();
    const keys = Object.keys(metadata);
    return {
      totalCached: keys.length,
      totalSize: keys.reduce((sum, key) => sum + (metadata[key].size || 0), 0),
      files: metadata
    };
  }
}
const cacheManager = new CacheManager();
// Worker-thread entry point
// When this module is loaded as a worker, extract the raw text of the chunk
// described by workerData and post exactly one result message to the parent.
if (!isMainThread) {
  // The spawner passes `chunk` and `index` at the top level of workerData;
  // a nested `data` object is accepted as well.
  const { type, data, ...inlinePayload } = workerData ?? {};
  const { chunk, index } = data ?? inlinePayload;
  if (type === 'process_chunk') {
    mammoth.extractRawText({ buffer: Buffer.from(chunk, 'base64') })
      .then((result) => {
        parentPort.postMessage({
          index,
          text: result.value,
          success: true
        });
      })
      .catch((error) => {
        // Report the failure as a message so the parent still gets a result.
        parentPort.postMessage({
          index,
          error: error.message,
          success: false
        });
      });
  }
}
// Tool definitions
// Answers the tools/list request with the schema of every tool this server
// exposes. The small builders below only assemble plain schema objects.
server.setRequestHandler(ListToolsRequestSchema, async () => {
  const documentTypes = () => ["ui-component", "api-doc", "common-doc", "other"];
  const stringProp = (description, extras = {}) => ({
    type: "string",
    description,
    ...extras
  });
  const booleanProp = (description, defaultValue) => ({
    type: "boolean",
    description,
    default: defaultValue
  });
  const defineTool = (name, description, properties, required) => ({
    name,
    description,
    inputSchema: required
      ? { type: "object", properties, required }
      : { type: "object", properties }
  });

  const readWordDocument = defineTool(
    "read_word_document",
    "增强版Word文档读取器,支持表格提取、图片OCR分析和缓存优化",
    {
      filePath: stringProp("Word文档的文件路径"),
      memoryKey: stringProp("用于存储的内存键名,便于后续检索", { default: "default" }),
      documentType: stringProp("文档类型", {
        enum: documentTypes(),
        default: "common-doc"
      }),
      extractTables: booleanProp("是否提取表格", true),
      extractImages: booleanProp("是否提取图片并进行OCR分析", true),
      useCache: booleanProp("是否使用缓存", true),
      outputDir: stringProp("图片和临时文件输出目录", { default: "./output" })
    },
    ["filePath"]
  );

  const searchDocuments = defineTool(
    "search_documents",
    "全文索引搜索,支持中英文混合搜索",
    {
      query: stringProp("搜索关键词"),
      documentType: stringProp("限制搜索的文档类型", { enum: documentTypes() }),
      limit: {
        type: "number",
        description: "返回结果数量限制",
        default: 10
      }
    },
    ["query"]
  );

  const getCacheStats = defineTool("get_cache_stats", "获取缓存统计信息", {});

  const clearCache = defineTool("clear_cache", "清空所有缓存", {
    type: stringProp("清除类型:all, document, index", {
      enum: ["all", "document", "index"],
      default: "all"
    })
  });

  const listStoredDocuments = defineTool("list_stored_documents", "列出所有已存储的文档", {
    documentType: stringProp("筛选特定类型的文档", { enum: documentTypes() })
  });

  const getStoredDocument = defineTool(
    "get_stored_document",
    "获取已存储的文档内容",
    {
      memoryKey: stringProp("要获取的文档内存键名")
    },
    ["memoryKey"]
  );

  const clearMemory = defineTool("clear_memory", "清除指定的内存内容", {
    memoryKey: stringProp("要清除的内存键名,如果不提供则清除所有")
  });

  return {
    tools: [
      readWordDocument,
      searchDocuments,
      getCacheStats,
      clearCache,
      listStoredDocuments,
      getStoredDocument,
      clearMemory
    ]
  };
});
// Tool execution
/** Wraps plain text in the tool-result shape returned by every tool below. */
const textResponse = (text) => ({
  content: [
    {
      type: "text",
      text
    }
  ]
});

// Dispatches a tools/call request to the matching tool implementation.
// Any error thrown by a tool is logged and returned as an `isError` result
// instead of being propagated.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name } = request.params;
  // Several tools have no required inputs and may be called without an
  // `arguments` object; default to {} so the destructuring below cannot throw.
  const args = request.params.arguments ?? {};
  try {
    switch (name) {
      case "read_word_document": {
        const {
          filePath,
          memoryKey = "default",
          documentType = "common-doc",
          extractTables = true,
          extractImages = true,
          useCache = true,
          outputDir = "./output"
        } = args;
        if (!fs.existsSync(filePath)) {
          throw new Error(`文件不存在: ${filePath}`);
        }
        const fileExt = path.extname(filePath).toLowerCase();
        if (fileExt !== '.docx' && fileExt !== '.doc') {
          throw new Error(`不支持的文件格式: ${fileExt}。仅支持 .docx 和 .doc 文件`);
        }
        // Diagnostics go to stderr: this server talks to its client over
        // stdio, so stdout must carry protocol messages only.
        // NOTE(review): the cache key does not include extractTables /
        // extractImages, so a cached result may have been produced with
        // different extraction options — confirm this is acceptable.
        let analysisResult = null;
        if (useCache) {
          analysisResult = await cacheManager.get(filePath);
          if (analysisResult) {
            console.error(`使用缓存数据: ${filePath}`);
          }
        }
        if (!analysisResult) {
          console.error(`分析文档: ${filePath}`);
          // Read the file.
          const docxBuffer = await fs.readFile(filePath);
          // Large documents take the chunked, parallel path.
          if (await largeDocProcessor.isLargeDocument(filePath)) {
            console.error('检测到大文档,使用并行处理');
            analysisResult = await largeDocProcessor.processInChunks(filePath, {
              extractTables,
              extractImages
            });
          } else {
            // Standard processing.
            const textResult = await mammoth.extractRawText({ buffer: docxBuffer });
            analysisResult = {
              text: textResult.value,
              tables: [],
              images: [],
              metadata: {
                filePath,
                documentType,
                processedAt: new Date().toISOString(),
                fileSize: docxBuffer.length
              }
            };
            // Extract tables.
            if (extractTables) {
              analysisResult.tables = await documentAnalyzer.extractTables(docxBuffer);
            }
            // Extract images.
            if (extractImages) {
              await fs.ensureDir(outputDir);
              analysisResult.images = await documentAnalyzer.extractImages(docxBuffer, outputDir);
            }
          }
          // Cache the result.
          if (useCache) {
            await cacheManager.set(filePath, analysisResult);
          }
        }
        // Document id used by the full-text index.
        const documentId = `${memoryKey}_${Date.now()}`;
        // Index the body text together with table cells and OCR text.
        const indexContent = analysisResult.text +
          ' ' + analysisResult.tables.map(t => t.rows.join(' ')).join(' ') +
          ' ' + analysisResult.images.map(i => i.ocrText).join(' ');
        documentIndexer.addDocument(documentId, indexContent, {
          memoryKey,
          filePath,
          documentType,
          tablesCount: analysisResult.tables.length,
          imagesCount: analysisResult.images.length
        });
        // Keep the document in memory under memoryKey for the other tools.
        const documentData = {
          ...analysisResult,
          documentId,
          memoryKey,
          filePath,
          documentType,
          timestamp: new Date().toISOString()
        };
        documentCache.set(memoryKey, documentData);
        // Build the response.
        let responseText = `成功读取并分析Word文档:\n`;
        responseText += `文件路径: ${filePath}\n`;
        responseText += `文档类型: ${documentType}\n`;
        responseText += `内存键: ${memoryKey}\n`;
        responseText += `内容长度: ${analysisResult.text.length} 字符\n`;
        responseText += `表格数量: ${analysisResult.tables.length}\n`;
        responseText += `图片数量: ${analysisResult.images.length}\n`;
        if (analysisResult.tables.length > 0) {
          responseText += `\n表格预览:\n`;
          analysisResult.tables.slice(0, 2).forEach((table, index) => {
            responseText += `表格${index + 1}: ${table.rows.length}行 x ${table.rows[0]?.length || 0}列\n`;
            if (table.rows.length > 0) {
              responseText += `示例行: ${table.rows[0]?.slice(0, 3).join(' | ')}\n`;
            }
          });
        }
        if (analysisResult.images.length > 0) {
          responseText += `\n图片OCR结果预览:\n`;
          analysisResult.images.slice(0, 2).forEach((image, index) => {
            const ocrPreview = image.ocrText.substring(0, 100);
            responseText += `图片${index + 1} (${image.filename}): ${ocrPreview}${image.ocrText.length > 100 ? '...' : ''}\n`;
          });
        }
        // Only mark the preview as truncated when text was actually cut off.
        const textPreview = analysisResult.text.substring(0, 300);
        responseText += `\n文本内容预览:\n${textPreview}${analysisResult.text.length > 300 ? '...' : ''}`;
        return textResponse(responseText);
      }
      case "search_documents": {
        const { query, documentType, limit = 10 } = args;
        const searchResults = documentIndexer.search(query);
        // Filter by document type when one was requested.
        const filteredResults = documentType
          ? searchResults.filter(result => result.document.documentType === documentType)
          : searchResults;
        const limitedResults = filteredResults.slice(0, limit);
        if (limitedResults.length === 0) {
          return textResponse(`未找到包含关键词 "${query}" 的文档`);
        }
        const resultsText = limitedResults.map((result, index) => {
          const doc = result.document;
          return `${index + 1}. 相关度: ${result.score}\n 内存键: ${doc.memoryKey}\n 文件: ${doc.filePath}\n 类型: ${doc.documentType}\n 表格数: ${doc.tablesCount}\n 图片数: ${doc.imagesCount}\n 最后索引: ${new Date(doc.lastIndexed).toLocaleString()}`;
        }).join('\n\n');
        return textResponse(`搜索结果 "${query}" (找到 ${limitedResults.length} 个匹配,共 ${filteredResults.length} 个):\n\n${resultsText}`);
      }
      case "get_cache_stats": {
        const documentStats = await cacheManager.getStats();
        const indexStats = documentIndexer.getStats();
        const statsText = `缓存统计信息:\n\n` +
          `文档缓存:\n` +
          `- 缓存文件数: ${documentStats.totalCached}\n` +
          `- 总缓存大小: ${(documentStats.totalSize / 1024 / 1024).toFixed(2)} MB\n\n` +
          `全文索引:\n` +
          `- 索引词汇数: ${indexStats.totalWords}\n` +
          `- 索引文档数: ${indexStats.totalDocuments}\n` +
          `- 最后更新: ${new Date(indexStats.lastUpdated).toLocaleString()}`;
        return textResponse(statsText);
      }
      case "clear_cache": {
        const { type = "all" } = args;
        let clearedMessage;
        switch (type) {
          case "document":
            await cacheManager.clear();
            clearedMessage = "已清空文档缓存";
            break;
          case "index":
            documentIndexer.clear();
            clearedMessage = "已清空全文索引";
            break;
          case "all":
          default:
            await cacheManager.clear();
            documentIndexer.clear();
            documentCache.flushAll();
            clearedMessage = "已清空所有缓存(文档缓存、全文索引、内存缓存)";
            break;
        }
        return textResponse(clearedMessage);
      }
      case "list_stored_documents": {
        const { documentType } = args;
        const docs = [];
        for (const key of documentCache.keys()) {
          const doc = documentCache.get(key);
          // The entry can be gone by the time it is read (e.g. TTL expiry).
          if (!doc) continue;
          if (!documentType || doc.documentType === documentType) {
            docs.push(doc);
          }
        }
        const docList = docs.map(doc =>
          `- 内存键: ${doc.memoryKey}\n 文件路径: ${doc.filePath}\n 文档类型: ${doc.documentType}\n 存储时间: ${doc.timestamp}\n 内容长度: ${doc.text?.length || 0} 字符\n 表格数: ${doc.tables?.length || 0}\n 图片数: ${doc.images?.length || 0}`
        ).join('\n\n');
        return textResponse(`已存储的文档 (${docs.length} 个):\n\n${docList || "暂无存储的文档"}`);
      }
      case "get_stored_document": {
        const { memoryKey } = args;
        const doc = documentCache.get(memoryKey);
        if (!doc) {
          throw new Error(`未找到内存键为 "${memoryKey}" 的文档`);
        }
        let responseText = `文档内容 (内存键: ${memoryKey}):\n\n`;
        responseText += `文件路径: ${doc.filePath}\n`;
        responseText += `文档类型: ${doc.documentType}\n`;
        responseText += `处理时间: ${doc.timestamp}\n\n`;
        // Body text.
        if (doc.text) {
          responseText += `【文本内容】\n${doc.text}\n\n`;
        }
        // Table contents.
        if (doc.tables && doc.tables.length > 0) {
          responseText += `【表格内容】(${doc.tables.length} 个)\n`;
          doc.tables.forEach((table, index) => {
            responseText += `\n表格${index + 1}:\n`;
            table.rows.forEach((row, rowIndex) => {
              responseText += `行${rowIndex + 1}: ${row.join(' | ')}\n`;
            });
          });
        }
        // Image OCR text.
        if (doc.images && doc.images.length > 0) {
          responseText += `【图片OCR内容】(${doc.images.length} 个)\n`;
          doc.images.forEach((image, index) => {
            responseText += `\n图片${index + 1} (${image.filename}):\n${image.ocrText}\n`;
          });
        }
        return textResponse(responseText);
      }
      case "clear_memory": {
        const { memoryKey } = args;
        if (memoryKey) {
          const removed = documentCache.del(memoryKey);
          if (removed > 0) {
            return textResponse(`已清除内存键 "${memoryKey}" 的内容`);
          }
          throw new Error(`未找到内存键 "${memoryKey}" 的内容`);
        }
        // No key given: clear everything held in memory.
        const count = documentCache.keys().length;
        documentCache.flushAll();
        return textResponse(`已清除所有内存内容 (共 ${count} 个文档)`);
      }
      default:
        throw new Error(`未知工具: ${name}`);
    }
  } catch (error) {
    console.error('工具执行错误:', error);
    return {
      ...textResponse(`错误: ${error.message}`),
      isError: true
    };
  }
});
// Start the server
/** Connects the MCP server to a stdio transport. */
async function main() {
  const transport = new StdioServerTransport();
  await server.connect(transport);
  console.error("Word Document Reader MCP server running on stdio");
}

// Graceful shutdown
/**
 * Releases the OCR worker and exits. A failing cleanup is logged and turned
 * into a non-zero exit code instead of an unhandled promise rejection.
 */
async function shutdown() {
  console.error('\n正在关闭服务器...');
  let exitCode = 0;
  try {
    await documentAnalyzer.cleanup();
  } catch (error) {
    console.error('资源清理错误:', error);
    exitCode = 1;
  }
  process.exit(exitCode);
}
process.on('SIGINT', shutdown);
process.on('SIGTERM', shutdown);
// 导出类供测试使用
export { DocumentIndexer, CacheManager, LargeDocumentProcessor, DocumentAnalyzer };
// Start the server on the main thread only: this same file is also loaded as
// a worker-thread script, and a worker must not open its own stdio server.
if (isMainThread) {
  main().catch((error) => {
    console.error("服务器启动失败:", error);
    process.exit(1);
  });
}