server.py (16.5 kB)
""" MCP Server for OfficeReader - Convert Office documents to Markdown. Supports: Word (docx/doc), Excel (xlsx/xls), PowerPoint (pptx/ppt) """ import asyncio import json import os import sys from pathlib import Path from typing import Any from mcp.server import Server from mcp.server.stdio import stdio_server from mcp.types import ( TextContent, Tool, INVALID_PARAMS, INTERNAL_ERROR, ) from .converter import OfficeConverter, SUPPORTED_FORMATS from .config_loader import get_config # Initialize the MCP server server = Server("officereader-mcp") # Global converter instance _converter: OfficeConverter | None = None def get_converter() -> OfficeConverter: """Get or create the converter instance.""" global _converter if _converter is None: # Load configuration config = get_config() cache_dir = config.get_cache_dir() _converter = OfficeConverter(cache_dir=cache_dir) return _converter def get_supported_extensions() -> list[str]: """Get list of all supported file extensions.""" extensions = [] for exts in SUPPORTED_FORMATS.values(): extensions.extend(exts) return extensions @server.list_tools() async def list_tools() -> list[Tool]: """List available tools for the MCP server.""" supported_exts = ", ".join(get_supported_extensions()) return [ Tool( name="convert_document", description=f"""Convert Office documents to Markdown format for Claude to read. Supported formats: - Word: .docx, .doc - Excel: .xlsx, .xls (converts each sheet to a Markdown table) - PowerPoint: .pptx, .ppt (extracts text and images from slides) Features: - Text extraction with formatting (headings, bold, italic, lists, tables) - Image extraction and optimization (auto-compressed for efficiency) - Speaker notes extraction from PowerPoint - Multi-sheet support for Excel Images are automatically optimized (WebP format, max 1920x1080) to reduce size while maintaining readability for Claude.""", inputSchema={ "type": "object", "properties": { "file_path": { "type": "string", "description": f"Absolute path to the Office document. Supported: {supported_exts}", }, "extract_images": { "type": "boolean", "description": "Whether to extract and include images (default: true)", "default": True, }, "image_format": { "type": "string", "enum": ["file", "base64", "both"], "description": "How to handle images: 'file' saves to disk (recommended), 'base64' embeds in markdown, 'both' does both", "default": "file", }, "output_name": { "type": "string", "description": "Custom name for the output (without extension). If not provided, generates from filename.", }, }, "required": ["file_path"], }, ), Tool( name="read_converted_markdown", description="""Read the content of a previously converted markdown file. Use this after convert_document to get the actual markdown content. This is useful when you want to process or analyze the converted document.""", inputSchema={ "type": "object", "properties": { "markdown_path": { "type": "string", "description": "Path to the markdown file (returned by convert_document)", }, }, "required": ["markdown_path"], }, ), Tool( name="list_conversions", description="""List all cached document conversions. Shows all documents that have been converted, including their output paths and number of extracted images. Useful for finding previously converted files.""", inputSchema={ "type": "object", "properties": {}, }, ), Tool( name="clear_cache", description="""Clear all cached conversions. Removes all converted markdown files and extracted images from the cache. 
Use this to free up disk space or reset the conversion cache.""", inputSchema={ "type": "object", "properties": {}, }, ), Tool( name="get_document_metadata", description=f"""Get metadata from an Office document without full conversion. Extracts document properties like title, author, creation date, etc. Faster than full conversion when you only need metadata. Supported formats: {supported_exts}""", inputSchema={ "type": "object", "properties": { "file_path": { "type": "string", "description": f"Absolute path to the Office document. Supported: {supported_exts}", }, }, "required": ["file_path"], }, ), Tool( name="get_supported_formats", description="""Get list of all supported file formats. Returns a dictionary of file types (word, excel, powerpoint) and their extensions.""", inputSchema={ "type": "object", "properties": {}, }, ), ] def get_cache_location_notice(converter: OfficeConverter) -> str: """Generate a notice about cache location.""" return f"[Cache Location] {converter.cache_dir}" @server.call_tool() async def call_tool(name: str, arguments: dict[str, Any]) -> list[TextContent]: """Handle tool calls.""" try: converter = get_converter() cache_notice = get_cache_location_notice(converter) if name == "convert_document": file_path = arguments.get("file_path") if not file_path: return [TextContent( type="text", text=json.dumps({"error": "file_path is required"}, ensure_ascii=False) )] extract_images = arguments.get("extract_images", True) image_format = arguments.get("image_format", "file") output_name = arguments.get("output_name") result = converter.convert_file( file_path=file_path, extract_images=extract_images, image_format=image_format, output_name=output_name, ) # Return summary with markdown preview response = { "status": "success", "file_type": result.get("file_type", "unknown"), "cache_location": str(converter.cache_dir), "output_path": result["output_path"], "conversion_dir": result["conversion_dir"], "images_extracted": len(result["images"]), "image_paths": result["images"], "metadata": result["metadata"], "markdown_preview": result["markdown"][:2000] + "..." 
if len(result["markdown"]) > 2000 else result["markdown"], "markdown_length": len(result["markdown"]), "hint": "Use 'list_conversions' to see all cached files, or 'read_converted_markdown' to read the full content.", } return [TextContent( type="text", text=f"{cache_notice}\n\n" + json.dumps(response, ensure_ascii=False, indent=2) )] elif name == "read_converted_markdown": markdown_path = arguments.get("markdown_path") if not markdown_path: return [TextContent( type="text", text=f"{cache_notice}\n\n" + json.dumps({"error": "markdown_path is required"}, ensure_ascii=False) )] path = Path(markdown_path) if not path.exists(): return [TextContent( type="text", text=f"{cache_notice}\n\n" + json.dumps({"error": f"File not found: {markdown_path}"}, ensure_ascii=False) )] with open(path, "r", encoding="utf-8") as f: content = f.read() return [TextContent( type="text", text=f"{cache_notice}\n[Reading] {markdown_path}\n[Length] {len(content)} characters\n\n---\n\n{content}" )] elif name == "list_conversions": cache_info = converter.get_cache_info() # Build a formatted summary summary_lines = [ "=" * 50, " OfficeReader-MCP Cache Information", "=" * 50, "", cache_notice, f"Output directory: {cache_info.get('output_dir', 'N/A')}", "", f"Total cached conversions: {cache_info['total_conversions']}", f"Total cache size: {cache_info.get('total_size_human', 'N/A')}", "", ] if cache_info['conversions']: summary_lines.append("-" * 50) summary_lines.append(" Cached Documents") summary_lines.append("-" * 50) for i, conv in enumerate(cache_info['conversions'], 1): summary_lines.append(f"\n[{i}] {conv['name']}") summary_lines.append(f" Directory: {conv['path']}") if conv['markdown_files']: summary_lines.append(f" Markdown: {conv['markdown_files'][0]}") summary_lines.append(f" Images: {conv['image_count']} files") summary_lines.append(f" Size: {conv.get('size_human', 'N/A')}") if conv.get('modified'): summary_lines.append(f" Modified: {conv['modified']}") else: summary_lines.append("-" * 50) summary_lines.append("No cached conversions found.") summary_lines.append("") summary_lines.append("To convert a document, use the 'convert_document' tool:") summary_lines.append(" file_path: <path to your .docx or .doc file>") summary_lines.append("") summary_lines.append("=" * 50) summary_lines.append("\n--- Raw JSON Data ---") return [TextContent( type="text", text="\n".join(summary_lines) + "\n" + json.dumps(cache_info, ensure_ascii=False, indent=2) )] elif name == "clear_cache": result = converter.clear_cache() response = { "status": "success", "cache_location": str(converter.cache_dir), "cleared_count": result["count"], "cleared_directories": result["cleared"], } return [TextContent( type="text", text=f"{cache_notice}\n\nCache cleared successfully!\n\n" + json.dumps(response, ensure_ascii=False, indent=2) )] elif name == "get_document_metadata": file_path = arguments.get("file_path") if not file_path: return [TextContent( type="text", text=f"{cache_notice}\n\n" + json.dumps({"error": "file_path is required"}, ensure_ascii=False) )] from .converter import get_file_type file_path_obj = Path(file_path) file_type = get_file_type(file_path_obj) metadata = { "file": file_path, "file_type": file_type, "cache_location": str(converter.cache_dir), } if file_type == "word": from docx import Document doc = Document(file_path) core_props = doc.core_properties metadata.update({ "title": core_props.title or "", "author": core_props.author or "", "created": str(core_props.created) if core_props.created else "", "modified": 
str(core_props.modified) if core_props.modified else "", "last_modified_by": core_props.last_modified_by or "", "subject": core_props.subject or "", "keywords": core_props.keywords or "", "category": core_props.category or "", "comments": core_props.comments or "", "revision": core_props.revision, }) elif file_type == "excel": from openpyxl import load_workbook wb = load_workbook(file_path, data_only=True) props = wb.properties metadata.update({ "title": props.title or "", "creator": props.creator or "", "created": str(props.created) if props.created else "", "modified": str(props.modified) if props.modified else "", "sheet_count": len(wb.sheetnames), "sheet_names": wb.sheetnames, }) elif file_type == "powerpoint": from pptx import Presentation prs = Presentation(file_path) core_props = prs.core_properties metadata.update({ "title": core_props.title or "", "author": core_props.author or "", "created": str(core_props.created) if core_props.created else "", "modified": str(core_props.modified) if core_props.modified else "", "subject": core_props.subject or "", "slide_count": len(prs.slides), }) else: return [TextContent( type="text", text=f"{cache_notice}\n\n" + json.dumps({ "error": f"Unsupported file format: {file_path_obj.suffix}", "supported": get_supported_extensions() }, ensure_ascii=False) )] return [TextContent( type="text", text=f"{cache_notice}\n\n" + json.dumps(metadata, ensure_ascii=False, indent=2) )] elif name == "get_supported_formats": return [TextContent( type="text", text=json.dumps({ "supported_formats": SUPPORTED_FORMATS, "all_extensions": get_supported_extensions(), }, ensure_ascii=False, indent=2) )] else: return [TextContent( type="text", text=f"{cache_notice}\n\n" + json.dumps({"error": f"Unknown tool: {name}"}, ensure_ascii=False) )] except FileNotFoundError as e: converter = get_converter() return [TextContent( type="text", text=f"[Cache Location] {converter.cache_dir}\n\n" + json.dumps({"error": str(e)}, ensure_ascii=False) )] except ValueError as e: converter = get_converter() return [TextContent( type="text", text=f"[Cache Location] {converter.cache_dir}\n\n" + json.dumps({"error": str(e)}, ensure_ascii=False) )] except Exception as e: converter = get_converter() return [TextContent( type="text", text=f"[Cache Location] {converter.cache_dir}\n\n" + json.dumps({ "error": f"Conversion failed: {str(e)}", "error_type": type(e).__name__, }, ensure_ascii=False) )] def main(): """Main entry point for the MCP server.""" async def run(): async with stdio_server() as (read_stream, write_stream): await server.run( read_stream, write_stream, server.create_initialization_options(), ) asyncio.run(run()) if __name__ == "__main__": main()
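The `OfficeConverter` class, the `SUPPORTED_FORMATS` mapping, and the `get_file_type` helper are imported from the sibling `converter` module, which is not part of this listing (nor is `config_loader`). Judging from how `get_supported_extensions()` iterates the mapping and how the metadata handler branches on "word", "excel", and "powerpoint", a plausible shape for those two names is the sketch below; it is an assumption inferred from usage in server.py, not the repository's actual source.

# Hypothetical sketch of the pieces server.py imports from converter.py.
# The real module also defines OfficeConverter; only the shapes used by
# server.py are reconstructed here, and the exact values may differ.
from pathlib import Path

SUPPORTED_FORMATS: dict[str, list[str]] = {
    "word": [".docx", ".doc"],
    "excel": [".xlsx", ".xls"],
    "powerpoint": [".pptx", ".ppt"],
}


def get_file_type(path: Path) -> str | None:
    """Map a file extension to its format key ('word', 'excel', 'powerpoint')."""
    suffix = path.suffix.lower()
    for file_type, extensions in SUPPORTED_FORMATS.items():
        if suffix in extensions:
            return file_type
    return None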

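For a quick smoke test, a minimal client sketch using the official `mcp` Python SDK can spawn the server over stdio, list its six tools, and call `convert_document`. The module path passed to `python -m` below is a hypothetical placeholder (this file uses relative imports, so the package name is not visible here); substitute the project's real entry point.

# client_example.py - a minimal sketch, not part of the repository.
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main() -> None:
    params = StdioServerParameters(
        command="python",
        args=["-m", "officereader_mcp.server"],  # hypothetical module path
    )
    async with stdio_client(params) as (read_stream, write_stream):
        async with ClientSession(read_stream, write_stream) as session:
            await session.initialize()

            # Discover the tools registered by list_tools()
            tools = await session.list_tools()
            print([tool.name for tool in tools.tools])

            # Convert a document and print the JSON summary the server returns
            result = await session.call_tool(
                "convert_document",
                {"file_path": "/absolute/path/to/report.docx"},
            )
            print(result.content[0].text)


if __name__ == "__main__":
    asyncio.run(main())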