
DOCX MCP Server

by zeph-gh
index.ts (11.3 kB)
#!/usr/bin/env node
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js'
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'
import { z } from 'zod'
import * as fs from 'fs'
import * as path from 'path'
import { createRequire } from 'module'

// Create require for CommonJS modules
const require = createRequire(import.meta.url)
const mammoth = require('mammoth')

const server = new McpServer({
  name: 'docx-format-server',
  version: '0.2.0',
})

// Tool to extract text content from DOCX files
server.tool(
  'extract_text',
  'Extract plain text content from a DOCX file',
  {
    file_path: z.string().describe('Path to the .docx file'),
  },
  async ({ file_path }) => {
    try {
      const absolutePath = path.resolve(file_path)
      if (!fs.existsSync(absolutePath)) {
        throw new Error(`File not found: ${absolutePath}`)
      }

      const result = await mammoth.extractRawText({ path: absolutePath })

      return {
        content: [
          {
            type: 'text',
            text: JSON.stringify(
              {
                text: result.value,
                messages: result.messages,
                word_count: result.value
                  .split(/\s+/)
                  .filter((word: string) => word.length > 0).length,
              },
              null,
              2
            ),
          },
        ],
      }
    } catch (error) {
      return {
        content: [
          {
            type: 'text',
            text: `Error extracting text: ${(error as Error).message}`,
          },
        ],
        isError: true,
      }
    }
  }
)

// Tool to convert DOCX to HTML with formatting preserved
server.tool(
  'convert_to_html',
  'Convert DOCX file to HTML with formatting preserved',
  {
    file_path: z.string().describe('Path to the .docx file'),
    include_styles: z
      .boolean()
      .optional()
      .describe('Include inline styles (default: true)'),
  },
  async ({ file_path, include_styles = true }) => {
    try {
      const absolutePath = path.resolve(file_path)
      if (!fs.existsSync(absolutePath)) {
        throw new Error(`File not found: ${absolutePath}`)
      }

      const options = include_styles
        ? {}
        : {
            styleMap: [
              "p[style-name='Heading 1'] => h1:fresh",
              "p[style-name='Heading 2'] => h2:fresh",
              "p[style-name='Heading 3'] => h3:fresh",
              "r[style-name='Strong'] => strong",
              "r[style-name='Emphasis'] => em",
            ],
          }

      const result = await mammoth.convertToHtml(
        { path: absolutePath },
        options
      )

      return {
        content: [
          {
            type: 'text',
            text: JSON.stringify(
              {
                html: result.value,
                messages: result.messages,
                warnings: result.messages.filter(
                  (m: any) => m.type === 'warning'
                ),
                errors: result.messages.filter((m: any) => m.type === 'error'),
              },
              null,
              2
            ),
          },
        ],
      }
    } catch (error) {
      return {
        content: [
          {
            type: 'text',
            text: `Error converting to HTML: ${(error as Error).message}`,
          },
        ],
        isError: true,
      }
    }
  }
)

// Tool to analyze document structure and formatting
server.tool(
  'analyze_structure',
  'Analyze document structure, headings, and formatting elements',
  {
    file_path: z.string().describe('Path to the .docx file'),
  },
  async ({ file_path }) => {
    try {
      const absolutePath = path.resolve(file_path)
      if (!fs.existsSync(absolutePath)) {
        throw new Error(`File not found: ${absolutePath}`)
      }

      // Convert to HTML to analyze structure
      const htmlResult = await mammoth.convertToHtml({ path: absolutePath })
      const html = htmlResult.value

      // Extract text for analysis
      const textResult = await mammoth.extractRawText({ path: absolutePath })
      const text = textResult.value

      // Analyze structure
      const headings = (html.match(/<h[1-6][^>]*>.*?<\/h[1-6]>/gi) || []).map(
        (h: string) => ({
          level: parseInt(h.match(/<h([1-6])/)![1]),
          text: h.replace(/<[^>]*>/g, '').trim(),
        })
      )

      const paragraphs = (html.match(/<p[^>]*>.*?<\/p>/gi) || []).length
      const strongElements = (html.match(/<strong[^>]*>.*?<\/strong>/gi) || [])
        .length
      const emElements = (html.match(/<em[^>]*>.*?<\/em>/gi) || []).length
      const lists = (html.match(/<[uo]l[^>]*>.*?<\/[uo]l>/gi) || []).length
      const listItems = (html.match(/<li[^>]*>.*?<\/li>/gi) || []).length

      const analysis = {
        document_stats: {
          total_characters: text.length,
          total_words: text
            .split(/\s+/)
            .filter((word: string) => word.length > 0).length,
          total_paragraphs: paragraphs,
          total_headings: headings.length,
        },
        structure: {
          headings: headings,
          heading_levels: [
            ...new Set(headings.map((h: any) => h.level)),
          ].sort(),
        },
        formatting: {
          bold_elements: strongElements,
          italic_elements: emElements,
          lists: lists,
          list_items: listItems,
        },
        messages: htmlResult.messages,
      }

      return {
        content: [
          {
            type: 'text',
            text: JSON.stringify(analysis, null, 2),
          },
        ],
      }
    } catch (error) {
      return {
        content: [
          {
            type: 'text',
            text: `Error analyzing structure: ${(error as Error).message}`,
          },
        ],
        isError: true,
      }
    }
  }
)

// Tool to extract images from DOCX
server.tool(
  'extract_images',
  'Extract and list images from a DOCX file',
  {
    file_path: z.string().describe('Path to the .docx file'),
    output_dir: z
      .string()
      .optional()
      .describe('Directory to save extracted images (optional)'),
  },
  async ({ file_path, output_dir }) => {
    try {
      const absolutePath = path.resolve(file_path)
      if (!fs.existsSync(absolutePath)) {
        throw new Error(`File not found: ${absolutePath}`)
      }

      const options = {
        convertImage: mammoth.images.imgElement(function (image: any) {
          if (output_dir) {
            const outputPath = path.resolve(output_dir)
            if (!fs.existsSync(outputPath)) {
              fs.mkdirSync(outputPath, { recursive: true })
            }
            const imagePath = path.join(
              outputPath,
              `image_${Date.now()}_${Math.random().toString(36).substr(2, 9)}.${
                image.contentType.split('/')[1]
              }`
            )
            return image.read().then(function (imageBuffer: Buffer) {
              fs.writeFileSync(imagePath, imageBuffer)
              return {
                src: imagePath,
                alt: image.altText || 'Extracted image',
              }
            })
          } else {
            return image.read().then(function (imageBuffer: Buffer) {
              return {
                src: `data:${image.contentType};base64,${imageBuffer.toString(
                  'base64'
                )}`,
                alt: image.altText || 'Embedded image',
                size: imageBuffer.length,
              }
            })
          }
        }),
      }

      const result = await mammoth.convertToHtml(
        { path: absolutePath },
        options
      )

      const images = (result.value.match(/<img[^>]*>/gi) || []).map(
        (img: string) => {
          const srcMatch = img.match(/src="([^"]*)"/)
          const altMatch = img.match(/alt="([^"]*)"/)
          return {
            src: srcMatch ? srcMatch[1] : '',
            alt: altMatch ? altMatch[1] : '',
            is_base64: srcMatch ? srcMatch[1].startsWith('data:') : false,
          }
        }
      )

      return {
        content: [
          {
            type: 'text',
            text: JSON.stringify(
              {
                total_images: images.length,
                images: images,
                output_directory: output_dir || 'Images embedded as base64',
                messages: result.messages,
              },
              null,
              2
            ),
          },
        ],
      }
    } catch (error) {
      return {
        content: [
          {
            type: 'text',
            text: `Error extracting images: ${(error as Error).message}`,
          },
        ],
        isError: true,
      }
    }
  }
)

// Tool to convert DOCX to Markdown
server.tool(
  'convert_to_markdown',
  'Convert DOCX file to Markdown format',
  {
    file_path: z.string().describe('Path to the .docx file'),
  },
  async ({ file_path }) => {
    try {
      const absolutePath = path.resolve(file_path)
      if (!fs.existsSync(absolutePath)) {
        throw new Error(`File not found: ${absolutePath}`)
      }

      // Convert to HTML first
      const htmlResult = await mammoth.convertToHtml({ path: absolutePath })
      let html = htmlResult.value

      // Simple HTML to Markdown conversion
      let markdown = html
        // Headers
        .replace(/<h1[^>]*>(.*?)<\/h1>/gi, '# $1\n\n')
        .replace(/<h2[^>]*>(.*?)<\/h2>/gi, '## $1\n\n')
        .replace(/<h3[^>]*>(.*?)<\/h3>/gi, '### $1\n\n')
        .replace(/<h4[^>]*>(.*?)<\/h4>/gi, '#### $1\n\n')
        .replace(/<h5[^>]*>(.*?)<\/h5>/gi, '##### $1\n\n')
        .replace(/<h6[^>]*>(.*?)<\/h6>/gi, '###### $1\n\n')
        // Bold and italic
        .replace(/<strong[^>]*>(.*?)<\/strong>/gi, '**$1**')
        .replace(/<b[^>]*>(.*?)<\/b>/gi, '**$1**')
        .replace(/<em[^>]*>(.*?)<\/em>/gi, '*$1*')
        .replace(/<i[^>]*>(.*?)<\/i>/gi, '*$1*')
        // Lists
        .replace(/<ul[^>]*>/gi, '')
        .replace(/<\/ul>/gi, '\n')
        .replace(/<ol[^>]*>/gi, '')
        .replace(/<\/ol>/gi, '\n')
        .replace(/<li[^>]*>(.*?)<\/li>/gi, '- $1\n')
        // Paragraphs
        .replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n')
        // Line breaks
        .replace(/<br[^>]*>/gi, '\n')
        // Remove remaining HTML tags
        .replace(/<[^>]*>/g, '')
        // Clean up extra whitespace
        .replace(/\n{3,}/g, '\n\n')
        .trim()

      return {
        content: [
          {
            type: 'text',
            text: JSON.stringify(
              {
                markdown: markdown,
                word_count: markdown
                  .split(/\s+/)
                  .filter((word: string) => word.length > 0).length,
                messages: htmlResult.messages,
              },
              null,
              2
            ),
          },
        ],
      }
    } catch (error) {
      return {
        content: [
          {
            type: 'text',
            text: `Error converting to Markdown: ${(error as Error).message}`,
          },
        ],
        isError: true,
      }
    }
  }
)

const transport = new StdioServerTransport()
await server.connect(transport)
console.error('Advanced DOCX MCP server running on stdio')
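For reference, here is a minimal sketch of how an MCP client could launch this server over stdio and call its extract_text tool, using the same @modelcontextprotocol/sdk package. The build path dist/index.js, the client name, and the report.docx argument are placeholders, not values taken from this repository.

import { Client } from '@modelcontextprotocol/sdk/client/index.js'
import { StdioClientTransport } from '@modelcontextprotocol/sdk/client/stdio.js'

// Spawn the compiled server as a child process speaking MCP over stdio
const transport = new StdioClientTransport({
  command: 'node',
  args: ['dist/index.js'], // assumed build output path
})

const client = new Client(
  { name: 'docx-client-example', version: '0.1.0' },
  { capabilities: {} }
)
await client.connect(transport)

// Invoke one of the tools registered above; file_path is a placeholder
const result = await client.callTool({
  name: 'extract_text',
  arguments: { file_path: 'report.docx' },
})
console.log(result.content)

await client.close()

The same pattern applies to the other tools (convert_to_html, analyze_structure, extract_images, convert_to_markdown); only the tool name and arguments change.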

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/zeph-gh/Docx-Mcp-Server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.