Skip to main content
Glama

TAPD Data Fetcher

docx_summarizer.py (3.92 kB)
""" docx_summarizer.py 功能: - 输入 .docx 文档路径,自动提取正文内容。 - 生成简要摘要(如截取前N段)。 - 作为 MCP 工具注册,便于 AI 自动调用。 依赖:python-docx """ from typing import Optional from docx import Document import os from mcp.server.fastmcp import FastMCP import json import shutil import uuid import csv from .common_utils import get_file_manager mcp = FastMCP("docx") @mcp.tool() def summarize_docx(docx_path: str, max_paragraphs: int = 5) -> str: """ 读取 docx 文档并生成摘要和完整内容的 JSON 数据,同时提取图片和表格 参数: docx_path (str): .docx 文件路径 max_paragraphs (int): 摘要最多包含的段落数,默认5 返回: str: JSON 字符串,包含所有段落、摘要、图片和表格信息 """ if not os.path.exists(docx_path): return json.dumps({"status": "error", "message": f"文件不存在: {docx_path}"}, ensure_ascii=False) try: doc = Document(docx_path) paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()] para_list = [ {"index": i+1, "text": text} for i, text in enumerate(paragraphs) ] summary = "\n".join(paragraphs[:max_paragraphs]) if paragraphs else "文档无有效正文内容。" # 1. 提取图片 # 获取项目根目录路径 current_dir = os.getcwd() pictures_dir = os.path.join(current_dir, 'documents_data', 'pictures_data') os.makedirs(pictures_dir, exist_ok=True) picture_files = [] rels = doc.part.rels for rel in rels: rel_obj = rels[rel] if rel_obj.reltype == 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image': img = rel_obj.target_part img_data = img.blob ext = os.path.splitext(img.partname)[1] img_name = f"{os.path.splitext(os.path.basename(docx_path))[0]}_{uuid.uuid4().hex[:8]}{ext}" img_path = os.path.join(pictures_dir, img_name) with open(img_path, 'wb') as f: f.write(img_data) picture_files.append(img_name) # 2. 
提取表格为 CSV tables_dir = os.path.join(current_dir, 'documents_data', 'excel_data') os.makedirs(tables_dir, exist_ok=True) table_files = [] for idx, table in enumerate(doc.tables): csv_name = f"{os.path.splitext(os.path.basename(docx_path))[0]}_table_{idx+1}.csv" csv_path = os.path.join(tables_dir, csv_name) with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile: writer = csv.writer(csvfile) for row in table.rows: writer.writerow([cell.text.strip() for cell in row.cells]) table_files.append(csv_name) result = { "status": "success", "summary": summary, "paragraphs": para_list, "total_paragraphs": len(para_list), "pictures": picture_files, "tables": table_files } return json.dumps(result, ensure_ascii=False, indent=2) except Exception as e: return json.dumps({"status": "error", "message": f"解析文档失败: {str(e)}"}, ensure_ascii=False) if __name__ == "__main__": # 示例用法 test_path = "documents_data/docx_data/2.docx" result_json = summarize_docx(test_path) print(result_json) # 使用统一的FileManager输出到主文件夹 file_manager = get_file_manager() result_data = json.loads(result_json) file_manager.save_json_data(result_data, "docx_summary.json") print("已将文本摘要输出到主文件夹,图片信息输出到documents_data/pictures_data文件夹,表格信息输出到documents_data/excel_data文件夹。")

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/OneCuriousLearner/MCPAgentRE'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.