MCP Document Parse

Overview Inspect Schema Related Servers Score Discussions

MCP-DocumentParse

file_upload_handler.py•16.7 kB

""" 文档解析MCP工具支持PDF、Word、Excel、PPT等格式转换为Markdown """ import hashlib import os import tempfile import time from datetime import datetime from pathlib import Path from typing import Annotated, Dict import requests from mcp.server import FastMCP from mcp.types import Field from tqdm import tqdm # 全局配置 document_cache: Dict[str, Dict] = {} # 创建MCP服务器实例 mcp = FastMCP("NiuTrans_Document_Parse") class UploadFileWrapper: """模拟FastAPI的UploadFile类""" def __init__(self, file_path: str): self.file_path = file_path self.filename = os.path.basename(file_path) self.file = open(file_path, 'rb') def close(self): """关闭文件""" if hasattr(self, 'file') and not self.file.closed: self.file.close() class DocumentTransClient: """文档转换API客户端（使用小牛翻译API）""" def __init__(self, base_url="https://api.niutrans.com", app_id="", apikey=""): self.base_url = base_url.rstrip('/') self.session = requests.Session() self.app_id = app_id self.apikey = apikey # 初始化API路径 self.convert_url = f"{self.base_url}/v2/doc/convert/uploadConvert" self.status_url = f"{self.base_url}/v2/doc/convert/status/{{file_no}}" self.interrupt_url = f"{self.base_url}/v2/doc/convert/interrupt/{{file_no}}" self.download_url = f"{self.base_url}/v2/doc/convert/download/{{file_no}}" def generate_auth_str(self, params): """生成权限字符串""" sorted_params = sorted(list(params.items()) + [('apikey', self.apikey)], key=lambda x: x[0]) param_str = '&'.join([f'{key}={value}' for key, value in sorted_params]) md5 = hashlib.md5() md5.update(param_str.encode('utf-8')) auth_str = md5.hexdigest() return auth_str def upload_and_convert(self, file, to_file_suffix="markdown", processing_mode=0, from_lang=None): """上传文件并转换""" files = {'file': file} data = { 'from': from_lang or 'zh', 'appId': self.app_id, 'timestamp': int(time.time()), 'toFileSuffix': to_file_suffix } # 生成鉴权字符串 auth_str = self.generate_auth_str(data) data['authStr'] = auth_str try: resp = self.session.post(self.convert_url, files=files, data=data) resp_json = resp.json() print(f"文档转换API上传返回值: {resp_json}") # 检查HTTP状态码 if resp.status_code != 200: error_msg = resp_json.get('msg', f"API返回错误状态码: {resp.status_code}") raise Exception(f"文件上传失败: {error_msg}") # 检查业务逻辑错误 code = resp_json.get('code', 500) if code != 200: error_msg = resp_json.get('msg', '未知错误') raise Exception(f"文件上传失败: {error_msg}") # 检查data是否存在 if 'data' not in resp_json or resp_json['data'] is None: raise Exception("文件上传失败: API返回数据为空") # 获取fileNo file_no = resp_json['data'].get('fileNo') if not file_no: raise Exception("文件上传失败: 未返回有效的fileNo") return file_no except Exception as e: # 如果是已捕获的异常，直接重新抛出 if isinstance(e, Exception) and "文件上传失败" in str(e): raise # 处理其他异常 error_msg = f"文件上传失败: {str(e)}" # 尝试从响应中获取更多错误信息 if 'resp_json' in locals(): error_msg += f"，API响应: {resp_json}" raise Exception(error_msg) def get_document_info(self, file_no): """获取文档信息""" params = { "appId": self.app_id, "timestamp": int(time.time()) } # 生成鉴权字符串 auth_str = self.generate_auth_str(params) params['authStr'] = auth_str # 替换URL中的占位符 url = self.status_url.format(file_no=file_no) try: resp = self.session.get(url, params=params) resp_json = resp.json() print(f"获取文档信息返回值: {resp_json}") # 检查HTTP状态码 if resp.status_code != 200: error_msg = resp_json.get('msg', f"API返回错误状态码: {resp.status_code}") raise Exception(f"获取文档信息失败: {error_msg}") # 检查业务逻辑错误 code = resp_json.get('code', 500) if code != 200: error_msg = resp_json.get('msg', '未知错误') raise Exception(f"获取文档信息失败: {error_msg}") # 检查data是否存在 if 'data' not in resp_json or resp_json['data'] is None: raise Exception("获取文档信息失败: API返回数据为空") return resp_json['data'] except Exception as e: # 如果是已捕获的异常，直接重新抛出 if isinstance(e, Exception) and "获取文档信息失败" in str(e): raise # 处理其他异常 error_msg = f"获取文档信息失败: {str(e)}" # 尝试从响应中获取更多错误信息 if 'resp_json' in locals(): error_msg += f"，API响应: {resp_json}" raise Exception(error_msg) def interrupt_convert(self, file_no): """中断转换""" data = { "appId": self.app_id, "timestamp": int(time.time()) } # 生成鉴权字符串 auth_str = self.generate_auth_str(data) data['authStr'] = auth_str # 替换URL中的占位符 url = self.interrupt_url.format(file_no=file_no) try: resp = self.session.put(url, data=data) resp_json = resp.json() print(f"中断转换返回值: {resp_json}") # 检查HTTP状态码 if resp.status_code != 200: error_msg = resp_json.get('msg', f"API返回错误状态码: {resp.status_code}") raise Exception(f"中断转换失败: {error_msg}") # 检查业务逻辑错误 code = resp_json.get('code', 500) if code != 200: error_msg = resp_json.get('msg', '未知错误') raise Exception(f"中断转换失败: {error_msg}") return True except Exception as e: # 如果是已捕获的异常，直接重新抛出 if isinstance(e, Exception) and "中断转换失败" in str(e): raise # 处理其他异常 error_msg = f"中断转换失败: {str(e)}" # 尝试从响应中获取更多错误信息 if 'resp_json' in locals(): error_msg += f"，API响应: {resp_json}" raise Exception(error_msg) def download_file(self, file_no, save_path): """下载文件""" params = { "type": 1, "appId": self.app_id, "timestamp": int(time.time()) } # 生成鉴权字符串 auth_str = self.generate_auth_str(params) params['authStr'] = auth_str # 替换URL中的占位符 url = self.download_url.format(file_no=file_no) try: with self.session.get(url, params=params, stream=True) as resp: total_size = int(resp.headers.get('content-length', 0)) # 获取文件名（如果响应头中有） file_name = None content_disposition = resp.headers.get('Content-Disposition') if content_disposition: try: file_name = content_disposition.split('=')[1] # 移除可能的引号 if file_name.startswith('"') and file_name.endswith('"'): file_name = file_name[1:-1] except: pass # 如果没有从响应头获取到文件名，使用默认名称 if not file_name: file_name = f"parsed_{file_no}.md" save_path = os.path.join(os.path.dirname(save_path), file_name) with open(save_path, 'wb') as f, tqdm( desc="下载解析结果", total=total_size, unit='B', unit_scale=True, unit_divisor=1024 ) as bar: for chunk in resp.iter_content(chunk_size=8192): f.write(chunk) bar.update(len(chunk)) return save_path except Exception as e: raise Exception(f"文件下载失败: {str(e)}") def wait_for_completion(self, file_no, interval=1, timeout=3600) -> dict: """等待转换完成""" start_time = time.time() last_progress = 0 with tqdm(desc="文档解析进度", unit="%") as pbar: while True: status_data = self.get_document_info(file_no) # 获取状态和进度信息 convertStatus = status_data.get('convertStatus', 200) # 根据状态码估算进度 progress_map = { 200: 0, # 等待中 201: 50, # 处理中 202: 100 } current_progress = progress_map.get(convertStatus, last_progress) if current_progress > last_progress: pbar.update(current_progress - last_progress) last_progress = current_progress # 判断是否完成 # 状态码202表示转换完成 if convertStatus == 202: pbar.update(100 - last_progress) return status_data elif convertStatus == 204: # 处理失败 error_msg = status_data.get('errorMsg', '处理失败') raise Exception(f"解析失败: {error_msg}") elif convertStatus == 106: # 已取消 raise Exception("解析任务已取消") elif time.time() - start_time > timeout: raise TimeoutError(f"解析超时（{timeout}秒）") time.sleep(interval) def call_document_convert_api(file) -> str: """调用文档转换API获取解析后的文本（主要是Markdown）""" api_key = os.getenv("NIUTRANS_API_KEY") app_id = os.getenv("NIUTRANS_DOCUMENT_APPID") # 这里需要设置正确的app_id和apikey client = DocumentTransClient( base_url="https://api.niutrans.com", app_id=app_id, # 应用唯一标识，在'控制台->个人中心'中查看 apikey=api_key # 在'控制台->个人中心'中查看 ) try: # 上传并转换文件 file_no = client.upload_and_convert( file=file.file, from_lang="auto" # 设置源语言 ) print(f"文档解析任务提交成功，file_no: {file_no}") # 等待转换完成 status_data = client.wait_for_completion(file_no) # 下载转换后的MD文件并直接读取内容 with tempfile.TemporaryDirectory() as temp_dir: downloaded_file_path = os.path.join(temp_dir, f"parsed_{file_no}.md") client.download_file(file_no, downloaded_file_path) # 直接读取MD文件内容 with open(downloaded_file_path, 'r', encoding='utf-8') as f: text_content = f.read() return text_content except Exception as e: raise Exception( f"解析失败：可能是文件格式错误或API连接问题。" f"原始错误：{str(e)}" ) def preprocess_raw_text(raw_text: str) -> str: """简单预处理Markdown文本（去除乱码和多余空行）""" lines = [line.strip() for line in raw_text.splitlines() if line.strip()] cleaned_text = "\n".join(lines) cleaned_text = cleaned_text.replace("\\u0000", "").replace("�", "").replace("\r", "") return cleaned_text def process_document_content(text_content: str) -> str: try: # 预处理文本 cleaned_content = preprocess_raw_text(text_content) return cleaned_content except Exception as e: raise Exception(f"文档处理失败: {str(e)}") @mcp.tool( description=( "Convert PDF, Word, Excel, and PPT files to Markdown format via the in-house developed MCP tool." "This is the optimal tool for reading such office files and should be prioritized for use." "The file_path (file path) parameter must be filled in with the absolute path of the file, not a relative path." "Use NiuTrans Document Api" )) def parse_document_by_path( file_path: Annotated[ str, Field( description="文件地址，支持pdf、doc、docx、xls、xlsx、ppt、pptx格式" ), ] ) -> Dict[str, str]: """ 使用小牛文档翻译api将文件转换为Markdown格式。处理完成后，会返回成功的Markdown格式文本内容。 Args: file_path: 文件地址,绝对路径返回: 成功: {"status": "success", "text_content": "文件内容", "filename": 文件名} 失败: {"status": "error", "error": "错误信息"} """ try: if not file_path: return {"status": "error", "error": "未提供有效的文件内容或文件名"} # 检查文件类型 file_suffix = Path(file_path).suffix.lower() # 同时支持带点和不带点的后缀格式 supported_suffixes = [".pdf", ".doc", ".docx", ".xls", ".xlsx", ".ppt", ".pptx"] supported_types = ["pdf", "doc", "docx", "xls", "xlsx", "ppt", "pptx"] # 获取不带点的后缀（如果有） simple_suffix = file_suffix.lstrip('.') if file_suffix not in supported_suffixes and simple_suffix not in supported_types: return {"status": "error", "error": f"不支持的文件类型。请上传以下格式的文件: {', '.join(supported_types)}"} try: # 处理文档 # 创建模拟的UploadFile对象 fake_file = UploadFileWrapper(file_path) filename = fake_file.filename text_content = call_document_convert_api(fake_file) # 处理文本内容 process_result = process_document_content(text_content) return { "text_content": process_result, "filename": filename, "status": "success" } except Exception as e: return {"status": "error", "error": f"解析失败：{str(e)}"} except Exception as e: return {"status": "error", "error": f"解析失败：{str(e)}"} @mcp.resource("document://supported-types") def get_supported_file_types() -> Dict[str, list]: return { "supported_types": [ {"format": "PDF", "extensions": [".pdf"], "mime_type": "application/pdf"}, {"format": "Word", "extensions": [".doc", ".docx"], "mime_type": ["application/msword", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"]}, {"format": "Excel", "extensions": [".xls", ".xlsx"], "mime_type": ["application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]}, {"format": "PPT", "extensions": [".ppt", ".pptx"], "mime_type": ["application/vnd.ms-powerpoint", "application/vnd.openxmlformats-officedocument.presentationml.presentation"]} ], "description": "支持解析文档并返回提取的Markdown格式内容" } def main(): """MCP工具主入口点""" # 直接启动MCP服务器，使用默认配置 mcp.run(transport="stdio") # 确保MCP实例被正确导出，便于被其他模块导入和使用 __all__ = ['mcp', 'parse_document_by_path', 'get_supported_file_types', 'main'] if __name__ == '__main__': main()

Implementation Reference

parse_document_by_path

Latest Blog Posts

What Is Context Bloat in MCP?
By Om-Shree-0709 on December 16, 2025.
mcp
Context Bloat
MCP Moves to the Linux Foundation: Neutral Stewardship for Agentic Infrastructure
By Om-Shree-0709 on December 15, 2025.
mcp
anthropic
Linux Foundation
Code Execution with MCP: Architecting Agentic Efficiency
By Om-Shree-0709 on December 14, 2025.
mcp
Token bloat

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/NiuTrans/MCP-DocumentParse'

If you have feedback or need assistance with the MCP directory API, please join our Discord server