"""
Document utility functions for Word Document Server.
"""
import json
import html
import os
import re
from typing import Dict, List, Any, Optional
from docx import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.oxml.ns import qn
def get_document_properties(doc_path: str) -> Dict[str, Any]:
    """Collect core metadata and simple size statistics for a Word document.

    Args:
        doc_path: Filesystem path of the document to inspect.

    Returns:
        A dict with the core properties and several counts, or a dict with a
        single ``"error"`` key when the file is missing or cannot be read.
    """
    if not os.path.exists(doc_path):
        return {"error": f"Document {doc_path} does not exist"}

    try:
        document = Document(doc_path)
        props = document.core_properties
        paragraphs = document.paragraphs

        created = str(props.created) if props.created else ""
        modified = str(props.modified) if props.modified else ""
        words = sum(len(item.text.split()) for item in paragraphs)

        info = {}
        info["title"] = props.title or ""
        info["author"] = props.author or ""
        info["subject"] = props.subject or ""
        info["keywords"] = props.keywords or ""
        info["created"] = created
        info["modified"] = modified
        info["last_modified_by"] = props.last_modified_by or ""
        info["revision"] = props.revision or 0
        # NOTE: this is the number of sections, reported under "page_count".
        info["page_count"] = len(document.sections)
        info["word_count"] = words
        info["paragraph_count"] = len(paragraphs)
        info["table_count"] = len(document.tables)
        return info
    except Exception as exc:
        reason = str(exc)
        return {"error": f"Failed to get document properties: {reason}"}
def extract_document_text(doc_path: str) -> str:
    """Return the text of every body paragraph followed by all table cell text.

    Args:
        doc_path: Filesystem path of the document to read.

    Returns:
        The collected paragraph texts joined with newlines, or a
        human-readable error message when the file is missing or unreadable.
    """
    if not os.path.exists(doc_path):
        return f"Document {doc_path} does not exist"

    try:
        document = Document(doc_path)
        # Body paragraphs first, then the paragraphs of each table cell.
        lines = [item.text for item in document.paragraphs]
        lines.extend(
            item.text
            for table in document.tables
            for row in table.rows
            for cell in row.cells
            for item in cell.paragraphs
        )
        return "\n".join(lines)
    except Exception as exc:
        reason = str(exc)
        return f"Failed to extract text: {reason}"
def get_document_structure(doc_path: str) -> Dict[str, Any]:
    """Summarise the paragraphs and tables of a Word document.

    Args:
        doc_path: Filesystem path of the document to inspect.

    Returns:
        A dict with ``"paragraphs"`` (index, text clipped to 100 characters,
        style name) and ``"tables"`` (index, row/column counts and a preview
        of at most 3x3 cells clipped to 20 characters), or a dict with a
        single ``"error"`` key on failure.
    """
    if not os.path.exists(doc_path):
        return {"error": f"Document {doc_path} does not exist"}

    def _clip(value, limit):
        # Shorten long text and mark the cut with an ellipsis.
        return value[:limit] + ("..." if len(value) > limit else "")

    try:
        document = Document(doc_path)

        paragraph_entries = []
        for position, paragraph in enumerate(document.paragraphs):
            paragraph_entries.append({
                "index": position,
                "text": _clip(paragraph.text, 100),
                "style": paragraph.style.name if paragraph.style else "Normal",
            })

        table_entries = []
        for position, table in enumerate(document.tables):
            row_total = len(table.rows)
            column_total = len(table.columns)
            preview = []
            for r in range(min(3, row_total)):
                sample = []
                for c in range(min(3, column_total)):
                    try:
                        sample.append(_clip(table.cell(r, c).text, 20))
                    except IndexError:
                        sample.append("N/A")
                preview.append(sample)
            table_entries.append({
                "index": position,
                "rows": row_total,
                "columns": column_total,
                "preview": preview,
            })

        return {"paragraphs": paragraph_entries, "tables": table_entries}
    except Exception as exc:
        reason = str(exc)
        return {"error": f"Failed to get document structure: {reason}"}
def find_paragraph_by_text(doc, text, partial_match=False):
    """
    Locate paragraphs by their text.

    Args:
        doc: Document object exposing a ``paragraphs`` sequence
        text: Text to look for
        partial_match: When true, a paragraph matches if it contains ``text``;
            otherwise its text must equal ``text`` exactly

    Returns:
        List of indices of the matching paragraphs, in document order
    """
    if partial_match:
        return [idx for idx, item in enumerate(doc.paragraphs) if text in item.text]
    return [idx for idx, item in enumerate(doc.paragraphs) if item.text == text]
def find_and_replace_text(doc, old_text, new_text):
    """
    Find and replace text throughout the document, skipping Table of Contents (TOC) paragraphs.

    Body paragraphs are processed first, then the paragraphs of every table
    cell. Replacement happens run by run, so an occurrence of ``old_text``
    that is split across several runs is not replaced.

    Args:
        doc: Document object
        old_text: Text to find; an empty value results in no changes
        new_text: Text to replace with

    Returns:
        Number of occurrences replaced
    """
    # An empty search string is "contained" in every run and str.replace
    # would insert new_text between every character, so treat it as a no-op.
    if not old_text:
        return 0

    def _replace_in_paragraphs(paragraphs):
        """Replace occurrences inside the runs of the given paragraphs."""
        replaced = 0
        for para in paragraphs:
            style_name = (para.style.name or "") if para.style else ""
            # Case-insensitive, consistent with the other TOC checks in this file.
            if style_name.lower().startswith("toc"):
                continue
            if old_text not in para.text:
                continue
            for run in para.runs:
                hits = run.text.count(old_text)
                if hits:
                    run.text = run.text.replace(old_text, new_text)
                    replaced += hits
        return replaced

    count = _replace_in_paragraphs(doc.paragraphs)
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                count += _replace_in_paragraphs(cell.paragraphs)
    return count
def get_document_xml(doc_path: str) -> str:
    """Return the raw contents of ``word/document.xml`` from the document archive.

    Args:
        doc_path: Filesystem path of the document (a zip archive).

    Returns:
        The XML decoded as UTF-8, or a human-readable error message when the
        file is missing or the member cannot be read.
    """
    import zipfile

    if not os.path.exists(doc_path):
        return f"Document {doc_path} does not exist"

    try:
        with zipfile.ZipFile(doc_path) as archive:
            raw = archive.read('word/document.xml')
        return raw.decode('utf-8')
    except Exception as exc:
        reason = str(exc)
        return f"Failed to extract XML: {reason}"
def insert_header_near_text(doc_path: str, target_text: str = None, header_title: str = "", position: str = 'after', header_style: str = 'Heading 1', target_paragraph_index: int = None) -> str:
    """Insert a header paragraph before or after a target paragraph and save the document.

    The target is chosen by ``target_paragraph_index`` when given, otherwise
    it is the first paragraph containing ``target_text``. Paragraphs whose
    style name starts with "toc" (case-insensitive) are skipped during the
    text search.

    Args:
        doc_path: Path to the Word document; the file is overwritten on success.
        target_text: Text to search for (ignored when an index is given).
        header_title: Text of the new header paragraph.
        position: 'before' inserts above the target; any other value inserts below.
        header_style: Style name applied to the new paragraph.
        target_paragraph_index: Optional index into the document's paragraphs.

    Returns:
        A status message describing the insertion or the failure.
    """
    if not os.path.exists(doc_path):
        return f"Document {doc_path} does not exist"
    try:
        doc = Document(doc_path)
        paragraphs = doc.paragraphs
        para = None
        anchor_index = None
        if target_paragraph_index is not None:
            if target_paragraph_index < 0 or target_paragraph_index >= len(paragraphs):
                return f"Invalid target_paragraph_index: {target_paragraph_index}. Document has {len(paragraphs)} paragraphs."
            anchor_index = target_paragraph_index
            para = paragraphs[anchor_index]
        elif target_text:
            # Record the index while searching: doc.paragraphs builds new
            # proxy objects on each access, so an identity comparison in a
            # second pass would never find the paragraph again.
            for i, p in enumerate(paragraphs):
                style_name = (p.style.name or "") if p.style else ""
                if style_name.lower().startswith("toc"):
                    continue
                if target_text in p.text:
                    para = p
                    anchor_index = i
                    break
        if para is None:
            return "Target paragraph not found (by index or text). (TOC paragraphs are skipped in text search)"
        # add_paragraph appends at the end of the body; move it next to the anchor.
        new_para = doc.add_paragraph(header_title, style=header_style)
        if position == 'before':
            para._element.addprevious(new_para._element)
        else:
            para._element.addnext(new_para._element)
        doc.save(doc_path)
        return f"Header '{header_title}' (style: {header_style}) inserted {position} paragraph (index {anchor_index})."
    except Exception as e:
        return f"Failed to insert header: {str(e)}"
def insert_line_or_paragraph_near_text(doc_path: str, target_text: str = None, line_text: str = "", position: str = 'after', line_style: str = None, target_paragraph_index: int = None) -> str:
    """
    Insert a new line or paragraph before or after the target paragraph and save the document.

    The target is chosen by ``target_paragraph_index`` when given, otherwise
    it is the first paragraph containing ``target_text``. Paragraphs whose
    style name starts with "toc" (case-insensitive) are skipped during the
    text search.

    Args:
        doc_path: Path to the Word document; the file is overwritten on success.
        target_text: Text to search for (ignored when an index is given).
        line_text: Text of the new paragraph.
        position: 'before' inserts above the target; any other value inserts below.
        line_style: Style name for the new paragraph; when empty, the target
            paragraph's style is reused.
        target_paragraph_index: Optional index into the document's paragraphs.

    Returns:
        A status message describing the insertion or the failure.
    """
    if not os.path.exists(doc_path):
        return f"Document {doc_path} does not exist"
    try:
        doc = Document(doc_path)
        paragraphs = doc.paragraphs
        para = None
        anchor_index = None
        if target_paragraph_index is not None:
            if target_paragraph_index < 0 or target_paragraph_index >= len(paragraphs):
                return f"Invalid target_paragraph_index: {target_paragraph_index}. Document has {len(paragraphs)} paragraphs."
            anchor_index = target_paragraph_index
            para = paragraphs[anchor_index]
        elif target_text:
            # Record the index while searching: doc.paragraphs builds new
            # proxy objects on each access, so an identity comparison in a
            # second pass would never find the paragraph again.
            for i, p in enumerate(paragraphs):
                style_name = (p.style.name or "") if p.style else ""
                if style_name.lower().startswith("toc"):
                    continue
                if target_text in p.text:
                    para = p
                    anchor_index = i
                    break
        if para is None:
            return "Target paragraph not found (by index or text). (TOC paragraphs are skipped in text search)"
        # Determine style: use the provided name or reuse the target's style object.
        style = line_style if line_style else para.style
        # add_paragraph appends at the end of the body; move it next to the anchor.
        new_para = doc.add_paragraph(line_text, style=style)
        if position == 'before':
            para._element.addprevious(new_para._element)
        else:
            para._element.addnext(new_para._element)
        doc.save(doc_path)
        # Report the style by name rather than the repr of a style object.
        style_label = style if isinstance(style, str) else getattr(style, "name", style)
        return f"Line/paragraph inserted {position} paragraph (index {anchor_index}) with style '{style_label}'."
    except Exception as e:
        return f"Failed to insert line/paragraph: {str(e)}"
def insert_numbered_list_near_text(doc_path: str, target_text: str = None, list_items: list = None, position: str = 'after', target_paragraph_index: int = None) -> str:
    """
    Insert a numbered list before or after the target paragraph and save the document.

    The target is chosen by ``target_paragraph_index`` when given, otherwise
    it is the first paragraph containing ``target_text``. Paragraphs whose
    style name starts with "toc" (case-insensitive) are skipped during the
    text search. Items keep their given order in both positions.

    Args:
        doc_path: Path to the Word document; the file is overwritten on success
        target_text: Text to search for in paragraphs (optional if using index)
        list_items: List of strings, each as a list item
        position: 'before' inserts above the target; any other value inserts below
        target_paragraph_index: Optional paragraph index to use as anchor

    Returns:
        Status message
    """
    if not os.path.exists(doc_path):
        return f"Document {doc_path} does not exist"
    try:
        doc = Document(doc_path)
        paragraphs = doc.paragraphs
        para = None
        anchor_index = None
        if target_paragraph_index is not None:
            if target_paragraph_index < 0 or target_paragraph_index >= len(paragraphs):
                return f"Invalid target_paragraph_index: {target_paragraph_index}. Document has {len(paragraphs)} paragraphs."
            anchor_index = target_paragraph_index
            para = paragraphs[anchor_index]
        elif target_text:
            # Record the index while searching: doc.paragraphs builds new
            # proxy objects on each access, so an identity comparison in a
            # second pass would never find the paragraph again.
            for i, p in enumerate(paragraphs):
                style_name = (p.style.name or "") if p.style else ""
                if style_name.lower().startswith("toc"):
                    continue
                if target_text in p.text:
                    para = p
                    anchor_index = i
                    break
        if para is None:
            return "Target paragraph not found (by index or text). (TOC paragraphs are skipped in text search)"
        # Use the first list-like style the document defines; None falls
        # back to the document's default paragraph style.
        style_name = None
        for candidate in ['List Number', 'List Paragraph', 'Normal']:
            try:
                _ = doc.styles[candidate]
            except KeyError:
                continue
            style_name = candidate
            break
        # add_paragraph appends at the end of the body; the items are moved below.
        new_paras = [doc.add_paragraph(item, style=style_name) for item in (list_items or [])]
        if position == 'before':
            # Each item lands directly above the anchor, so forward order
            # leaves the items in their original sequence.
            for p in new_paras:
                para._element.addprevious(p._element)
        else:
            # Each item lands directly below the anchor, so insert the last
            # item first to end up with the original sequence.
            for p in reversed(new_paras):
                para._element.addnext(p._element)
        doc.save(doc_path)
        return f"Numbered list inserted {position} paragraph (index {anchor_index})."
    except Exception as e:
        return f"Failed to insert numbered list: {str(e)}"
def is_toc_paragraph(para):
    """Return True if the paragraph uses a table-of-contents (TOC) style.

    The check is a case-insensitive prefix match on the style name. A
    paragraph without a style, or whose style has no name, is not a TOC
    paragraph. Always returns a bool.
    """
    style = para.style
    if not style:
        return False
    return (style.name or "").upper().startswith("TOC")
def is_heading_paragraph(para):
    """Return True if the paragraph uses a heading style (Heading 1, Heading 2, ...).

    The check is a case-insensitive prefix match on the style name. A
    paragraph without a style, or whose style has no name, is not a heading.
    Always returns a bool.
    """
    style = para.style
    if not style:
        return False
    return (style.name or "").lower().startswith("heading")
# --- Helper: Get the style value from a <w:p> element ---
def get_paragraph_style(el):
    """Return the ``w:val`` of the paragraph's ``<w:pStyle>`` element, if any.

    Args:
        el: A ``<w:p>`` XML element.

    Returns:
        The style identifier stored in the XML, or None when the paragraph
        has no ``<w:pPr>``, no ``<w:pStyle>``, or no ``w:val`` attribute.
    """
    from docx.oxml.ns import qn
    pPr = el.find(qn('w:pPr'))
    if pPr is None:
        return None
    pStyle = pPr.find(qn('w:pStyle'))
    if pStyle is None:
        return None
    # Attribute keys are namespace-qualified, so the prefixed name 'w:val'
    # never matches directly; look it up through qn().
    return pStyle.get(qn('w:val'))
# --- Main: Delete the paragraphs under a header until the next heading/TOC ---
def delete_block_under_header(doc, header_text):
    """
    Remove the paragraphs that follow a header, up to the next heading/TOC paragraph.

    The header is the first non-TOC paragraph whose stripped, lower-cased
    text equals ``header_text`` (compared the same way). The block ends at
    the next paragraph whose style name starts with 'heading', 'título' or
    'toc' (case-insensitive), or at the end of the document. Only paragraphs
    listed in ``doc.paragraphs`` are removed; other elements such as tables
    are left untouched. The document is modified in memory and not saved.

    Args:
        doc: Document object
        header_text: Text of the header paragraph

    Returns:
        (header_element, paragraphs_removed), or (None, 0) if the header is not found
    """
    # doc.paragraphs is rebuilt on every access, so take one snapshot.
    paragraphs = doc.paragraphs
    wanted = header_text.strip().lower()

    def _style_name(para):
        """Lower-cased style name, or '' when the style or its name is missing."""
        return (para.style.name or "").lower() if para.style else ""

    header_idx = None
    for i, para in enumerate(paragraphs):
        # A TOC entry may repeat the header text; it is never the header itself.
        if _style_name(para).startswith("toc"):
            continue
        if para.text.strip().lower() == wanted:
            header_idx = i
            break
    if header_idx is None:
        return None, 0

    # The block ends at the next heading/TOC paragraph, else at the end.
    end_idx = len(paragraphs)
    for i in range(header_idx + 1, len(paragraphs)):
        if _style_name(paragraphs[i]).startswith(('heading', 'título', 'toc')):
            end_idx = i
            break

    removed_count = 0
    for para in paragraphs[header_idx + 1:end_idx]:
        p = para._p
        p.getparent().remove(p)
        removed_count += 1
    return paragraphs[header_idx]._p, removed_count
# --- Usage in replace_paragraph_block_below_header ---
def replace_paragraph_block_below_header(
    doc_path: str,
    header_text: str,
    new_paragraphs: list[str],
    detect_block_end_fn=None,
    new_paragraph_style: str = None
) -> str:
    """
    Replace the content below a header (matched by text) up to the next heading/TOC (matched by style).

    The header is the first non-TOC paragraph whose stripped, lower-cased text
    equals ``header_text``. The existing block is removed through
    ``delete_block_under_header`` and the new paragraphs are inserted right
    after the header, in order, before the document is saved in place.
    ``detect_block_end_fn`` is accepted but not used here.
    """
    from docx import Document
    import os

    if not os.path.exists(doc_path):
        return f"Document {doc_path} not found."

    document = Document(doc_path)

    # Locate the header, ignoring table-of-contents entries with the same text.
    anchor = None
    for candidate in document.paragraphs:
        toc_entry = is_toc_paragraph(candidate)
        same_text = candidate.text.strip().lower() == header_text.strip().lower()
        if same_text and not toc_entry:
            anchor = candidate
            break
    if anchor is None:
        return f"Header '{header_text}' not found in document."

    # Clear the old block on the same document instance.
    _, removed_count = delete_block_under_header(document, header_text)

    style_to_use = new_paragraph_style if new_paragraph_style else "Normal"

    # Chain the insertions so every new paragraph follows the previous one.
    tail = anchor
    for body_text in new_paragraphs:
        inserted = document.add_paragraph(body_text, style=style_to_use)
        tail._element.addnext(inserted._element)
        tail = inserted

    document.save(doc_path)
    return f"Replaced content under '{header_text}' with {len(new_paragraphs)} paragraph(s), style: {style_to_use}, removed {removed_count} elements."
def replace_block_between_manual_anchors(
    doc_path: str,
    start_anchor,
    new_paragraphs: list[str],
    end_anchor=None,
    new_paragraph_style: str = None
) -> str:
    """
    Replace all paragraphs between start_anchor and end_anchor (both exclusive) and save the document.

    Anchors can be given as a string (paragraph text, compared after
    stripping whitespace) or as an integer (paragraph index). A string end
    anchor is searched for after the start anchor only. If end_anchor is
    None, everything up to the end of the document is removed.

    Args:
        doc_path: Path to the Word document; the file is overwritten on success
        start_anchor: Text or index of the paragraph after which content is replaced
        new_paragraphs: Items to insert after the start anchor, in order. Each
            item is either a string or a dict with optional "text" and "style"
            keys; None is treated as an empty list
        end_anchor: Text or index of the paragraph where removal stops
        new_paragraph_style: Default style for inserted paragraphs ("Normal" if empty)

    Returns:
        Status message
    """
    if not os.path.exists(doc_path):
        return f"Document {doc_path} not found."
    doc = Document(doc_path)
    # doc.paragraphs is rebuilt on every access, so take one snapshot.
    paragraphs = doc.paragraphs
    total = len(paragraphs)
    new_paragraphs = new_paragraphs or []

    # Resolve the start anchor.
    if isinstance(start_anchor, int):
        start_idx = start_anchor
        if start_idx < 0 or start_idx >= total:
            return f"Invalid start_anchor index: {start_idx}. Document has {total} paragraphs."
    else:
        wanted_start = str(start_anchor).strip()
        start_idx = next(
            (i for i, para in enumerate(paragraphs) if para.text.strip() == wanted_start),
            None,
        )
        if start_idx is None:
            return f"Start anchor '{start_anchor}' not found."

    # Resolve the end anchor.
    if end_anchor is None:
        end_idx = total
    elif isinstance(end_anchor, int):
        end_idx = end_anchor
        if end_idx < 0 or end_idx > total:
            return f"Invalid end_anchor index: {end_idx}. Document has {total} paragraphs."
    else:
        wanted_end = str(end_anchor).strip()
        end_idx = next(
            (i for i in range(start_idx + 1, total) if paragraphs[i].text.strip() == wanted_end),
            None,
        )
        if end_idx is None:
            return f"End anchor '{end_anchor}' not found after start anchor."

    # Remove everything strictly between the anchors; the slice is empty
    # when the end anchor is not after the start anchor.
    removed_count = 0
    for para in paragraphs[start_idx + 1:end_idx]:
        element = para._element
        element.getparent().remove(element)
        removed_count += 1

    # Insert the new paragraphs after the start anchor, chaining them so
    # they keep their order.
    style_to_use = new_paragraph_style or "Normal"
    current_para = paragraphs[start_idx]
    for item in new_paragraphs:
        if isinstance(item, dict):
            text = item.get("text", "")
            style = item.get("style", style_to_use)
        else:
            text = str(item)
            style = style_to_use
        new_para = doc.add_paragraph(text, style=style)
        current_para._element.addnext(new_para._element)
        current_para = new_para
    doc.save(doc_path)
    return f"Replaced content between anchor {start_anchor} and {end_anchor if end_anchor is not None else 'end of document'} with {len(new_paragraphs)} paragraph(s), removed {removed_count} paragraphs."
def convert_document_to_markdown(doc_path: str, output_path: Optional[str] = None, use_pandoc: bool = True) -> str:
    """
    Convert a Word document to Markdown.

    Args:
        doc_path: Path to the Word document
        output_path: Optional destination file. When omitted, the markdown is returned as a string
        use_pandoc: Try pypandoc first and fall back to the built-in converter
            when it is unavailable or fails; when false, use the built-in converter directly

    Returns:
        A success message when output_path is given, the markdown text otherwise,
        or an error message on failure
    """
    if not os.path.exists(doc_path):
        return f"Document {doc_path} does not exist"

    def _deliver(content, engine):
        # Either hand the markdown back or write it out and report which engine produced it.
        if not output_path:
            return content
        with open(output_path, 'w', encoding='utf-8') as handle:
            handle.write(content)
        return f"Successfully converted {doc_path} to {output_path} using {engine}"

    try:
        if use_pandoc:
            try:
                import pypandoc
                pandoc_markdown = pypandoc.convert_file(
                    doc_path,
                    'md',
                    format='docx',
                    extra_args=['--extract-media=./images'],  # images are extracted to ./images
                )
                return _deliver(pandoc_markdown, "pypandoc")
            except ImportError:
                # pypandoc is not installed; continue with the built-in converter.
                pass
            except Exception as e:
                # pypandoc raised; report it and continue with the built-in converter.
                print(f"Pypandoc failed: {e}, falling back to custom implementation")

        return _deliver(_convert_docx_to_markdown_custom(doc_path), "custom implementation")
    except Exception as e:
        reason = str(e)
        return f"Failed to convert document to markdown: {reason}"
def _convert_docx_to_markdown_custom(doc_path: str) -> str:
    """
    Custom implementation to convert DOCX to Markdown using python-docx.

    Walks the children of the document body in order so paragraphs and tables
    keep their relative position. Paragraphs are rendered with
    ``_convert_paragraph_to_markdown`` and tables with
    ``_convert_table_to_markdown``; blocks that render to whitespace only are
    dropped, and a blank line is added after every table.

    Args:
        doc_path: Path to the Word document.

    Returns:
        The rendered markdown, stripped of leading and trailing whitespace.
    """
    doc = Document(doc_path)
    # Build the element -> wrapper lookups once instead of rescanning
    # doc.paragraphs / doc.tables for every body element.
    paragraphs_by_element = {p._element: p for p in doc.paragraphs}
    tables_by_element = {t._element: t for t in doc.tables}

    markdown_lines = []
    for element in doc.element.body:
        tag = element.tag
        if not isinstance(tag, str):
            # Not a regular element (e.g. an XML comment); nothing to render.
            continue
        if tag.endswith('}p'):  # Paragraph
            para = paragraphs_by_element.get(element)
            if para is None:
                continue
            md_line = _convert_paragraph_to_markdown(para)
            if md_line.strip():
                markdown_lines.append(md_line)
        elif tag.endswith('}tbl'):  # Table
            table = tables_by_element.get(element)
            if table is None:
                continue
            md_table = _convert_table_to_markdown(table)
            if md_table.strip():
                markdown_lines.append(md_table)
                markdown_lines.append("")  # Add spacing after table
    return "\n".join(markdown_lines).strip()
def _convert_paragraph_to_markdown(paragraph) -> str:
"""Convert a paragraph to markdown format."""
text = paragraph.text.strip()
if not text:
return ""
# Check if it's a heading
style_name = paragraph.style.name.lower() if paragraph.style else ""
if "heading" in style_name or "título" in style_name:
# Extract heading level
level = 1
if "1" in style_name:
level = 1
elif "2" in style_name:
level = 2
elif "3" in style_name:
level = 3
elif "4" in style_name:
level = 4
elif "5" in style_name:
level = 5
elif "6" in style_name:
level = 6
return f"{'#' * level} {text}"
# Check for list items
if hasattr(paragraph, '_element') and paragraph._element.pPr is not None:
numPr = paragraph._element.pPr.numPr
if numPr is not None:
# This is a list item
ilvl = numPr.ilvl
level = int(ilvl.val) if ilvl is not None else 0
indent = " " * level
return f"{indent}- {text}"
# Process inline formatting
formatted_text = _process_inline_formatting(paragraph)
return formatted_text
def _process_inline_formatting(paragraph) -> str:
    """Render the runs of a paragraph with bold, italic, underline and colour markup.

    Each non-empty run is HTML-escaped, then wrapped from the inside out:
    italic (``*``), bold (``**``), underline (``<u>``) and finally a colour
    ``<span>`` when the run has an explicit RGB colour. The rendered runs are
    concatenated without separators.
    """
    pieces = []
    for run in paragraph.runs:
        raw = run.text
        if not raw:
            continue

        fragment = html.escape(raw)
        if run.italic:
            fragment = "*" + fragment + "*"
        if run.bold:
            fragment = "**" + fragment + "**"
        if run.underline:
            # Markdown has no underline syntax, so fall back to an HTML tag.
            fragment = "<u>" + fragment + "</u>"

        # The colour span is the outermost wrapper.
        color_hex = _get_run_color_hex(run)
        if color_hex:
            fragment = f'<span style="color:{color_hex}">{fragment}</span>'

        pieces.append(fragment)
    return "".join(pieces)
def _convert_table_to_markdown(table) -> str:
"""Convert a table to markdown format."""
if not table.rows:
return ""
markdown_rows = []
# Process header row
header_row = table.rows[0]
header_cells = []
for cell in header_row.cells:
cell_text = cell.text.strip().replace('\n', ' ')
header_cells.append(cell_text)
markdown_rows.append("| " + " | ".join(header_cells) + " |")
# Add separator row
separator = "| " + " | ".join(["---"] * len(header_cells)) + " |"
markdown_rows.append(separator)
# Process data rows
for row in table.rows[1:]:
row_cells = []
for cell in row.cells:
cell_text = cell.text.strip().replace('\n', ' ')
row_cells.append(cell_text)
# Ensure row has same number of cells as header
while len(row_cells) < len(header_cells):
row_cells.append("")
markdown_rows.append("| " + " | ".join(row_cells) + " |")
return "\n".join(markdown_rows)
def get_document_markdown_preview(doc_path: str, max_length: int = 1000) -> str:
    """
    Get a preview of the document as markdown (first max_length characters).

    Args:
        doc_path: Path to the Word document
        max_length: Maximum length of preview

    Returns:
        Markdown preview of the document, with a "(truncated)" marker when it
        was cut, or a message starting with "Failed to generate preview:" when
        the document is missing or the conversion fails
    """
    # Report a missing file as a failure here; otherwise the converter's
    # "does not exist" message would be returned as if it were preview text.
    if not os.path.exists(doc_path):
        return f"Failed to generate preview: Document {doc_path} does not exist"
    try:
        markdown_content = convert_document_to_markdown(doc_path, output_path=None)
        # The converter reports its own errors as text starting with "Failed".
        if not isinstance(markdown_content, str) or markdown_content.startswith("Failed"):
            return f"Failed to generate preview: {markdown_content}"
        if len(markdown_content) > max_length:
            return markdown_content[:max_length] + "\n\n... (truncated)"
        return markdown_content
    except Exception as e:
        return f"Failed to generate preview: {str(e)}"
def _get_run_color_hex(run) -> Optional[str]:
"""
Return a CSS hex color string for the run's font color (e.g. '#RRGGBB'),
or None if no explicit RGB color is set.
"""
try:
color = getattr(run.font, "color", None)
if not color:
return None
# Attempt to get an RGBColor object/string exposed by python-docx
rgb = getattr(color, "rgb", None)
if rgb:
# rgb may be an RGBColor object; str(rgb) usually yields 'RRGGBB'
hexstr = str(rgb)
# Ensure 6 hex digits (python-docx typically returns 6 hex chars)
hexstr = hexstr.strip()
if len(hexstr) == 6:
return "#" + hexstr
# If it already had a '#'
if hexstr.startswith("#") and len(hexstr) == 7:
return hexstr
# Some cases: color may expose theme_color; we don't attempt to map theme to hex here.
except Exception:
return None
return None