Skip to main content
Glama

Office Word MCP Server

by franlealp1
markdown_tools.py (23.5 kB)
"""
Markdown tools for Word Document Server.

These tools handle formatting operations for Word documents and converting
them to Markdown format and the opposite.
"""
import os
import re
import html
from typing import Optional

from docx import Document
from docx.shared import RGBColor


# Colour wrapper as emitted by _process_inline_formatting:
# <span style="color:#RRGGBB">...</span>  (group 1 = colour, group 2 = inner text)
_COLOR_SPAN_RE = re.compile(
    r'<span\s+style=["\']\s*color\s*:\s*(#?[0-9A-Fa-f]{6})\s*["\']\s*>(.*?)</span>',
    flags=re.S,
)

# <u>...</u> is the only underline syntax this module reads or writes.
_UNDERLINE_RE = re.compile(r'<u>(.*?)</u>', flags=re.S)

# Emphasis markers, tried in this order: (pattern, adds_bold, adds_italic).
# The triple-star form comes first because the exporter writes bold+italic
# runs as ***text***, which the plain ** pattern would split incorrectly.
_EMPHASIS_PATTERNS = (
    (re.compile(r'\*\*\*(.+?)\*\*\*', flags=re.S), True, True),
    (re.compile(r'\*\*(.+?)\*\*', flags=re.S), True, False),
    (re.compile(r'\*(.+?)\*', flags=re.S), False, True),
)

# Table separator row such as "| --- | :-: |". A single column is accepted;
# _is_table_separator additionally requires a pipe so a bare "---" is rejected.
_TABLE_SEPARATOR_RE = re.compile(r'^\s*\|?\s*[:\-]+\s*(\|\s*[:\-]+\s*)*\|?\s*$')

# Leading "- " of a simple bullet list item.
_BULLET_RE = re.compile(r'^\s*-\s+')

# First run of digits in a style name, e.g. the "2" in "heading 2".
_HEADING_LEVEL_RE = re.compile(r'\d+')


def convert_document_to_markdown(doc_path: str, output_path: Optional[str] = None, use_pandoc: bool = True) -> str:
    """
    Convert a Word document to Markdown format.

    Args:
        doc_path: Path to the Word document
        output_path: Optional path for output file. If None, returns markdown as string
        use_pandoc: Accepted for backward compatibility. This module contains no
            pandoc conversion path, so the custom python-docx implementation is
            always used and this flag has no effect on the result.

    Returns:
        Success message if output_path provided, or markdown content as string.
        On failure an explanatory message is returned instead of raising.
    """
    if not os.path.exists(doc_path):
        return f"Document {doc_path} does not exist"

    try:
        # Custom implementation using python-docx
        markdown_content = _convert_docx_to_markdown_custom(doc_path)

        if output_path:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(markdown_content)
            return f"Successfully converted {doc_path} to {output_path} using custom implementation"
        return markdown_content
    except Exception as e:
        # Tool boundary: report the problem as text rather than raising.
        return f"Failed to convert document to markdown: {str(e)}"


def _convert_docx_to_markdown_custom(doc_path: str) -> str:
    """
    Custom implementation to convert DOCX to Markdown using python-docx.

    Walks the document body in order so paragraphs and tables keep their
    relative position. Handles paragraphs, headings, tables, lists, and basic
    inline formatting.

    Args:
        doc_path: Path to the Word document

    Returns:
        The Markdown text, with leading/trailing whitespace stripped.
    """
    doc = Document(doc_path)

    # Index the wrapper objects by their XML element once, instead of
    # re-scanning doc.paragraphs / doc.tables for every body element.
    paragraphs_by_element = {p._element: p for p in doc.paragraphs}
    tables_by_element = {t._element: t for t in doc.tables}

    markdown_lines = []
    for element in doc.element.body:
        tag = element.tag
        if not isinstance(tag, str):
            # XML comments / processing instructions have a non-string tag.
            continue

        if tag.endswith('}p'):  # Paragraph
            para = paragraphs_by_element.get(element)
            if para is None:
                continue
            md_line = _convert_paragraph_to_markdown(para)
            if md_line.strip():
                markdown_lines.append(md_line)
        elif tag.endswith('}tbl'):  # Table
            table = tables_by_element.get(element)
            if table is None:
                continue
            md_table = _convert_table_to_markdown(table)
            if md_table.strip():
                markdown_lines.append(md_table)
                markdown_lines.append("")  # Add spacing after table

    return "\n".join(markdown_lines).strip()


def _convert_paragraph_to_markdown(paragraph) -> str:
    """
    Convert a paragraph to markdown format.

    Headings become "#"-prefixed lines, numbered/bulleted paragraphs become
    "- " items indented by list level, and everything else is rendered with
    inline formatting. Headings and list items use the plain paragraph text.

    Args:
        paragraph: python-docx paragraph object

    Returns:
        The markdown line, or "" when the paragraph has no visible text.
    """
    text = paragraph.text.strip()
    if not text:
        return ""

    # Check if it's a heading ("título" covers Spanish-localised style names)
    style = paragraph.style
    style_name = (style.name or "").lower() if style else ""
    if "heading" in style_name or "título" in style_name:
        # Use the number in the style name, clamped to Markdown's 1..6 range;
        # default to level 1 when the name carries no number.
        level_match = _HEADING_LEVEL_RE.search(style_name)
        level = min(max(int(level_match.group()), 1), 6) if level_match else 1
        return f"{'#' * level} {text}"

    # Check for list items (paragraphs carrying numbering properties)
    element = getattr(paragraph, '_element', None)
    if element is not None and element.pPr is not None:
        numPr = element.pPr.numPr
        if numPr is not None:
            ilvl = numPr.ilvl
            level = int(ilvl.val) if ilvl is not None else 0
            indent = " " * level
            return f"{indent}- {text}"

    # Process inline formatting
    return _process_inline_formatting(paragraph)


def _process_inline_formatting(paragraph) -> str:
    """
    Process inline formatting like bold, italic, underline and color.

    Each run's text is HTML-escaped, then wrapped (innermost first) in
    *italic*, **bold**, <u>underline</u> and finally a colour <span>, so the
    colour span is always the outermost wrapper.

    Args:
        paragraph: python-docx paragraph object

    Returns:
        The concatenated markup of all non-empty runs.
    """
    pieces = []
    for run in paragraph.runs:
        if not run.text:
            continue

        # Escape HTML special characters so literal "<" / "&" survive
        run_text = html.escape(run.text)

        # Italic then bold ordering - preserve both (Markdown allows nesting)
        if run.italic:
            run_text = f"*{run_text}*"
        if run.bold:
            run_text = f"**{run_text}**"
        if run.underline:
            # Markdown has no standard underline; use HTML <u>
            run_text = f"<u>{run_text}</u>"

        # Detect explicit RGB color and wrap in an HTML span if present
        color_hex = _get_run_color_hex(run)
        if color_hex:
            run_text = f'<span style="color:{color_hex}">{run_text}</span>'

        pieces.append(run_text)

    return "".join(pieces)


def _convert_table_to_markdown(table) -> str:
    """
    Convert a table to markdown format.

    The first row is used as the header. Newlines inside a cell are replaced
    by spaces, and data rows shorter than the header are padded with empty
    cells.

    Args:
        table: python-docx table object

    Returns:
        The markdown table, or "" when the table has no rows.
    """
    rows = list(table.rows)
    if not rows:
        return ""

    def cell_texts(row) -> list:
        # Markdown table cells must stay on a single line.
        return [cell.text.strip().replace('\n', ' ') for cell in row.cells]

    def render(cells) -> str:
        return "| " + " | ".join(cells) + " |"

    header_cells = cell_texts(rows[0])
    markdown_rows = [
        render(header_cells),
        render(["---"] * len(header_cells)),  # separator row
    ]

    for row in rows[1:]:
        row_cells = cell_texts(row)
        # Ensure row has at least as many cells as the header
        row_cells.extend([""] * (len(header_cells) - len(row_cells)))
        markdown_rows.append(render(row_cells))

    return "\n".join(markdown_rows)


def get_document_markdown_preview(doc_path: str, max_length: int = 1000) -> str:
    """
    Get a preview of the document as markdown (first max_length characters).

    Args:
        doc_path: Path to the Word document
        max_length: Maximum length of preview

    Returns:
        Markdown preview of the document, or a message starting with
        "Failed to generate preview:" when the document is missing or cannot
        be converted.
    """
    # Check existence explicitly: otherwise the "does not exist" message from
    # the converter would be returned as if it were document content.
    if not os.path.exists(doc_path):
        return f"Failed to generate preview: Document {doc_path} does not exist"

    try:
        # Call the converter directly so that errors surface as exceptions and
        # document text that happens to start with "Failed" is not misread.
        markdown_content = _convert_docx_to_markdown_custom(doc_path)
    except Exception as e:
        return f"Failed to generate preview: {str(e)}"

    if len(markdown_content) > max_length:
        return markdown_content[:max_length] + "\n\n... (truncated)"
    return markdown_content


def _get_run_color_hex(run) -> Optional[str]:
    """
    Return a CSS hex color string for the run's font color (e.g. '#RRGGBB'),
    or None if no explicit RGB color is set.

    Best effort: any error while reading the colour yields None.
    """
    try:
        color = getattr(run.font, "color", None)
        if not color:
            return None
        rgb = getattr(color, "rgb", None)
        if not rgb:
            return None
        hexstr = str(rgb).strip()
        if len(hexstr) == 6:
            return "#" + hexstr
        if hexstr.startswith("#") and len(hexstr) == 7:
            return hexstr
    except Exception:
        return None
    return None


def _hex_to_rgb_tuple(hex_color: str) -> tuple[int, int, int]:
    """
    Return (r,g,b) from '#RRGGBB' or 'RRGGBB'.

    Raises:
        ValueError: if the value is not six hexadecimal digits.
    """
    digits = hex_color.lstrip('#')
    if len(digits) != 6:
        raise ValueError("Invalid hex color")
    return int(digits[0:2], 16), int(digits[2:4], 16), int(digits[4:6], 16)


def _create_run_with_formatting(paragraph, text: str, color_hex: Optional[str] = None,
                                bold: bool = False, italic: bool = False, underline: bool = False):
    """
    Create a run on paragraph with the given formatting and color (if provided).

    Args:
        paragraph: python-docx paragraph to append the run to
        text: Run text; HTML entities are unescaped before insertion
        color_hex: Optional '#RRGGBB' / 'RRGGBB' font colour
        bold: Bold flag (always written to the run)
        italic: Italic flag (always written to the run)
        underline: Underline flag (only written when True)

    Returns:
        The created run.
    """
    run = paragraph.add_run(html.unescape(text))
    run.bold = bool(bold)
    run.italic = bool(italic)
    if underline:
        run.underline = True
    if color_hex:
        try:
            r, g, b = _hex_to_rgb_tuple(color_hex)
            run.font.color.rgb = RGBColor(r, g, b)
        except Exception:
            # Deliberate best effort: an unusable colour falls back to no colour.
            pass
    return run


def _create_runs_from_markdown_text(paragraph, text: str, color_hex: Optional[str] = None):
    """
    Parse simple markdown inline formatting in 'text' and create runs on paragraph.

    Supports:
    - <span style="color:#RRGGBB">...</span> (split off by the caller; nested
      spans are handled when the final runs are created)
    - <u>...</u>
    - **bold**
    - *italic*

    This is a pragmatic parser (handles nested bold/italic reasonably).

    Args:
        paragraph: python-docx paragraph to append runs to
        text: Inline markdown text
        color_hex: Optional colour applied to every created run
    """
    pos = 0
    for match in _UNDERLINE_RE.finditer(text):
        before = text[pos:match.start()]
        if before:
            # Text between matches cannot contain a complete <u>...</u> pair.
            _create_runs_from_markdown_text_with_flags(paragraph, before, color_hex, underline=False)
        # The underlined part may itself contain bold/italic markers.
        _create_runs_from_markdown_text_with_flags(paragraph, match.group(1), color_hex, underline=True)
        pos = match.end()

    # Always emit what follows the last </u> (or the whole text when there is
    # no <u> at all); otherwise trailing text would be lost.
    remaining = text[pos:]
    if remaining:
        _create_runs_from_markdown_text_with_flags(paragraph, remaining, color_hex, underline=False)


def _create_runs_from_markdown_text_with_flags(paragraph, text: str, color_hex: Optional[str],
                                               underline: bool = False, bold: bool = False,
                                               italic: bool = False):
    """
    Parse bold/italic markers in text and create runs. Called recursively.

    Priority: bold+italic (***...***), then bold (**...**), then italic
    (*...*). Markers are removed when creating runs. Formatting flags already
    in effect are inherited by nested text.

    Known limitation: this is regex based, so directly adjacent runs whose
    markers touch (e.g. "*a***b**") can still be split at the wrong place.

    Args:
        paragraph: python-docx paragraph to append runs to
        text: Inline markdown text
        color_hex: Optional colour applied to every created run
        underline: Underline state inherited from the caller
        bold: Bold state inherited from the caller
        italic: Italic state inherited from the caller
    """
    for pattern, adds_bold, adds_italic in _EMPHASIS_PATTERNS:
        match = pattern.search(text)
        if match is None:
            continue

        before = text[:match.start()]
        if before:
            _create_runs_from_markdown_text_with_flags(
                paragraph, before, color_hex, underline=underline, bold=bold, italic=italic)

        # The emphasised part gains the marker's flag and keeps being parsed.
        _create_runs_from_markdown_text_with_flags(
            paragraph, match.group(1), color_hex, underline=underline,
            bold=bold or adds_bold, italic=italic or adds_italic)

        after = text[match.end():]
        if after:
            _create_runs_from_markdown_text_with_flags(
                paragraph, after, color_hex, underline=underline, bold=bold, italic=italic)
        return

    # No bold/italic left: create the run(s) with the accumulated flags
    _create_runs_from_markdown_text_force_flags(
        paragraph, text, color_hex, bold=bold, italic=italic, underline=underline)


def _create_runs_from_markdown_text_force_flags(paragraph, text: str, color_hex: Optional[str],
                                                bold: bool, italic: bool, underline: bool):
    """
    Create runs for plain text while applying force flags.

    This does not attempt further markdown parsing inside text (used when
    we've forced bold/italic state). Colour spans that are still present are
    honoured: their inner text is emitted with the span's colour.

    Args:
        paragraph: python-docx paragraph to append runs to
        text: Text to emit; nothing is created when it is empty
        color_hex: Colour for text outside any span
        bold: Bold flag for every created run
        italic: Italic flag for every created run
        underline: Underline flag for every created run
    """
    if not text:
        return

    pos = 0
    for match in _COLOR_SPAN_RE.finditer(text):
        before = text[pos:match.start()]
        if before:
            _create_run_with_formatting(paragraph, before, color_hex,
                                        bold=bold, italic=italic, underline=underline)
        span_color = match.group(1)
        # create runs for inner text with span color while preserving forced flags
        _create_runs_from_markdown_text_force_flags(
            paragraph, match.group(2), span_color if span_color else color_hex,
            bold=bold, italic=italic, underline=underline)
        pos = match.end()

    remainder = text[pos:]
    if remainder:
        _create_run_with_formatting(paragraph, remainder, color_hex,
                                    bold=bold, italic=italic, underline=underline)


def _is_table_separator(line: str) -> bool:
    """Return True if line looks like a markdown table separator row (e.g. '| --- | --- |')."""
    return '|' in line and _TABLE_SEPARATOR_RE.match(line) is not None


def _parse_markdown_blocks(md_text: str) -> list:
    """
    Parse the markdown into a list of blocks.

    Each block is a dict:
    - {"type": "paragraph", "lines": [...]} or
    - {"type": "table", "lines": [...]}  (header, separator, then data rows)

    Table detection: a line containing '|' that is immediately followed by a
    separator row. A table ends at the first blank line or the first line
    without '|'.

    Args:
        md_text: Markdown text; None or empty yields an empty list

    Returns:
        The list of blocks in document order.
    """
    if not md_text:
        return []

    lines = md_text.splitlines()
    total = len(lines)
    blocks = []
    i = 0
    while i < total:
        line = lines[i]
        if not line.strip():
            # skip blank lines
            i += 1
            continue

        # Detect table start: header line followed by a separator row
        if '|' in line and i + 1 < total and _is_table_separator(lines[i + 1]):
            tbl_lines = [line, lines[i + 1]]
            j = i + 2
            while j < total and lines[j].strip() and '|' in lines[j]:
                tbl_lines.append(lines[j])
                j += 1
            blocks.append({"type": "table", "lines": tbl_lines})
            i = j
            continue

        # Otherwise, collect paragraph (or list) lines until blank or a '|' line
        para_lines = [line]
        j = i + 1
        while j < total and lines[j].strip() != "" and '|' not in lines[j]:
            para_lines.append(lines[j])
            j += 1
        blocks.append({"type": "paragraph", "lines": para_lines})
        i = j

    return blocks


def _add_inline_markdown(paragraph, text: str) -> None:
    """Append runs for text to paragraph, splitting on top-level colour spans first."""
    pos = 0
    for match in _COLOR_SPAN_RE.finditer(text):
        before = text[pos:match.start()]
        if before:
            _create_runs_from_markdown_text(paragraph, before, None)
        _create_runs_from_markdown_text(paragraph, match.group(2), match.group(1))
        pos = match.end()
    remainder = text[pos:]
    if remainder:
        _create_runs_from_markdown_text(paragraph, remainder, None)


def _split_table_row(line: str) -> list:
    """Split a markdown table line into stripped cells, dropping the empty edge cells of '| a | b |'."""
    parts = [cell.strip() for cell in line.split('|')]
    if parts and parts[0] == '':
        parts = parts[1:]
    if parts and parts[-1] == '':
        parts = parts[:-1]
    return parts


def insert_markdown_block_into_document(md_block: str, source_doc_path: str, position: Optional[int] = None) -> str:
    """
    Convert the provided markdown block into Word paragraphs and tables and insert it into the DOCX.

    Every markdown line of a paragraph block becomes its own Word paragraph;
    lines starting with "- " use the 'List Bullet' style when the document
    has it. The document is saved back to source_doc_path.

    Args:
        md_block: Markdown text containing paragraphs and/or tables.
        source_doc_path: Path to the Word document to modify.
        position: Optional paragraph index after which to insert, counted
            over the paragraphs that exist before insertion. If None or
            invalid, appends at the end.

    Returns:
        Status string describing what was inserted (counts) or an error message.
    """
    if not os.path.exists(source_doc_path):
        return f"Document '{source_doc_path}' does not exist."

    try:
        doc = Document(source_doc_path)
        blocks = _parse_markdown_blocks(md_block)

        # Resolve the anchor BEFORE creating anything: new paragraphs are
        # first appended to the document, so validating the index afterwards
        # could select one of the freshly created paragraphs as the anchor.
        anchor_para = None
        pos_idx = None
        if position is not None:
            try:
                pos_idx = int(position)
            except (TypeError, ValueError, OverflowError):
                pos_idx = None
            if pos_idx is not None:
                existing_paragraphs = doc.paragraphs
                if 0 <= pos_idx < len(existing_paragraphs):
                    anchor_para = existing_paragraphs[pos_idx]
        insert_at_end = anchor_para is None

        # Elements are created at the end of the document and moved later.
        created_elements = []  # list of tuples ('p', paragraph_obj) or ('tbl', table_obj)

        for block in blocks:
            if block["type"] == "paragraph":
                for line in block["lines"]:
                    text = line.rstrip()
                    if _BULLET_RE.match(text):
                        # Simple bullet list item
                        try:
                            p = doc.add_paragraph('', style='List Bullet')
                        except Exception:
                            # Style unavailable in this document: keep a
                            # visible dash so the item still reads as a bullet.
                            p = doc.add_paragraph('')
                            p.add_run('- ')
                        text = _BULLET_RE.sub('', text)  # strip leading '- '
                    else:
                        p = doc.add_paragraph('')
                    _add_inline_markdown(p, text)
                    created_elements.append(('p', p))

            elif block["type"] == "table":
                # The parser guarantees line 0 is the header and line 1 the
                # separator; skip only that separator so a data row made of
                # dashes is not lost.
                tbl_lines = block["lines"]
                rows = [_split_table_row(ln) for ln in tbl_lines[:1] + tbl_lines[2:]]
                if not rows:
                    continue
                num_cols = max(len(r) for r in rows)
                if num_cols == 0:
                    continue

                table = doc.add_table(rows=len(rows), cols=num_cols)
                if 'Table Grid' in doc.styles:
                    table.style = 'Table Grid'

                for r_idx, row in enumerate(rows):
                    for c_idx in range(num_cols):
                        val = row[c_idx] if c_idx < len(row) else ""
                        cell = table.cell(r_idx, c_idx)
                        cell.text = ""  # reset the default paragraph
                        _add_inline_markdown(cell.paragraphs[0], val)
                created_elements.append(('tbl', table))

            # unknown block type -> ignore

        if not insert_at_end:
            # Move each created element to be after the anchor, in order
            anchor_element = anchor_para._element
            for kind, elem in created_elements:
                if kind == 'p':
                    anchor_element.addnext(elem._element)
                    anchor_element = elem._element
                elif kind == 'tbl':
                    anchor_element.addnext(elem._tbl)
                    # An empty paragraph after the table serves as the anchor
                    # for whatever is inserted next.
                    sep = doc.add_paragraph('')
                    elem._tbl.addnext(sep._p)
                    anchor_element = sep._p
        # else: keep appended elements (already at end)

        # Save document
        doc.save(source_doc_path)

        # Build counts for report
        p_count = sum(1 for k, _ in created_elements if k == 'p')
        t_count = sum(1 for k, _ in created_elements if k == 'tbl')
        where = "end of document" if insert_at_end else f"after paragraph index {pos_idx}"
        return f"Inserted {p_count} paragraph(s) and {t_count} table(s) into '{source_doc_path}' at {where}."
    except Exception as e:
        # Tool boundary: report the problem as text rather than raising.
        return f"Failed to insert markdown block into document: {str(e)}"

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/franlealp1/mcp-word'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.