Skip to main content
Glama
document_handler.py (10.6 kB)
"""Document handler for DOCX file operations.

This module provides the core document handling functionality including
reading, writing, validation, and metadata extraction.
"""

import io
from pathlib import Path
from typing import Any, BinaryIO

from docx import Document

# ``docx.Document`` is a factory function; the class it returns lives in
# ``docx.document``. The class is imported under an alias so that it can be
# used in type annotations.
from docx.document import Document as DocumentObject

from src.core.constants import SUPPORTED_FORMATS
from src.core.exceptions import InvalidDocumentError, UnsupportedFormatError
from src.models.dto import DocumentMetadataDTO


class DocumentHandler:
    """Handler for DOCX document operations.

    This class provides methods for reading, writing, validating, and
    extracting information from DOCX files. It keeps track of at most one
    loaded document together with the path it was opened from or last saved
    to (``None`` for documents that have no path yet).
    """

    def __init__(self) -> None:
        """Initialize the document handler with no document loaded."""
        self._document: DocumentObject | None = None
        self._file_path: str | None = None

    @property
    def document(self) -> DocumentObject:
        """Get the current document.

        Returns:
            The current Document instance.

        Raises:
            InvalidDocumentError: If no document is loaded.
        """
        if self._document is None:
            raise InvalidDocumentError("No document loaded")
        return self._document

    def _load(
        self,
        source: str | BinaryIO,
        *,
        file_path: str | None,
        failure_message: str,
    ) -> DocumentObject:
        """Parse ``source`` and make the result the current document.

        The handler state is only updated once parsing has succeeded, so a
        failed load leaves the previously loaded document untouched.

        Args:
            source: Path string or binary file-like object passed to
                ``docx.Document``.
            file_path: Value to remember as the document's path.
            failure_message: Prefix of the error message used on failure.

        Returns:
            The newly loaded Document instance.

        Raises:
            InvalidDocumentError: If ``source`` cannot be parsed.
        """
        try:
            loaded = Document(source)
        except Exception as e:
            # Any parsing failure is reported as a single domain error; the
            # original exception is chained as the cause.
            raise InvalidDocumentError(f"{failure_message}: {e}") from e

        self._document = loaded
        self._file_path = file_path
        return loaded

    def create_document(self) -> DocumentObject:
        """Create a new empty DOCX document.

        The new document replaces the current one and has no file path.

        Returns:
            A new Document instance.
        """
        self._document = Document()
        self._file_path = None
        return self._document

    def open_document(self, file_path: str | Path) -> DocumentObject:
        """Open a DOCX document from a file path.

        Args:
            file_path: Path to the DOCX file.

        Returns:
            The opened Document instance.

        Raises:
            InvalidDocumentError: If the file does not exist or cannot be
                opened.
            UnsupportedFormatError: If the file format is not supported.
        """
        path = Path(file_path)

        if not path.exists():
            raise InvalidDocumentError(f"File not found: {file_path}")

        ext = path.suffix.lower()
        if ext not in SUPPORTED_FORMATS:
            raise UnsupportedFormatError(
                f"Unsupported format: {ext}",
                format_=ext,
                supported_formats=list(SUPPORTED_FORMATS.keys()),
            )

        return self._load(
            str(path),
            file_path=str(path),
            failure_message="Failed to open document",
        )

    def open_from_bytes(self, content: bytes) -> DocumentObject:
        """Open a DOCX document from bytes.

        Args:
            content: Document content as bytes.

        Returns:
            The opened Document instance.

        Raises:
            InvalidDocumentError: If the content cannot be parsed.
        """
        return self._load(
            io.BytesIO(content),
            file_path=None,
            failure_message="Failed to parse document",
        )

    def open_from_stream(self, stream: BinaryIO) -> DocumentObject:
        """Open a DOCX document from a file-like object.

        Args:
            stream: File-like object containing document data.

        Returns:
            The opened Document instance.

        Raises:
            InvalidDocumentError: If the stream cannot be parsed.
        """
        return self._load(
            stream,
            file_path=None,
            failure_message="Failed to parse document stream",
        )

    def save_document(self, file_path: str | Path | None = None) -> str:
        """Save the document to a file.

        Missing parent directories are created. After a successful save the
        target becomes the handler's remembered file path. Errors raised
        while creating the directory or writing the file are not wrapped.

        Args:
            file_path: Path to save the document. If None (or otherwise
                falsy), saves to the remembered path.

        Returns:
            The path where the document was saved.

        Raises:
            InvalidDocumentError: If no document is loaded or no path provided.
        """
        if self._document is None:
            raise InvalidDocumentError("No document to save")

        save_path = file_path or self._file_path
        if save_path is None:
            raise InvalidDocumentError("No file path provided")

        path = Path(save_path)
        path.parent.mkdir(parents=True, exist_ok=True)

        self._document.save(str(path))
        self._file_path = str(path)
        return self._file_path

    def save_to_bytes(self) -> bytes:
        """Save the document to bytes.

        Returns:
            Document content as bytes.

        Raises:
            InvalidDocumentError: If no document is loaded.
        """
        if self._document is None:
            raise InvalidDocumentError("No document to save")

        buffer = io.BytesIO()
        self._document.save(buffer)
        # getvalue() returns the whole buffer regardless of stream position.
        return buffer.getvalue()

    def validate_document(self, file_path: str | Path) -> bool:
        """Validate if a file is a valid DOCX document.

        The file is parsed but not loaded into the handler.

        Args:
            file_path: Path to the file to validate.

        Returns:
            True if the file is a valid DOCX document.

        Raises:
            InvalidDocumentError: If validation fails.
        """
        try:
            Document(str(file_path))
        except Exception as e:
            raise InvalidDocumentError(f"Invalid document: {e}") from e
        return True

    def validate_bytes(self, content: bytes) -> bool:
        """Validate if bytes represent a valid DOCX document.

        The content is parsed but not loaded into the handler.

        Args:
            content: Document content as bytes.

        Returns:
            True if the content is a valid DOCX document.

        Raises:
            InvalidDocumentError: If validation fails.
        """
        try:
            Document(io.BytesIO(content))
        except Exception as e:
            raise InvalidDocumentError(f"Invalid document content: {e}") from e
        return True

    def get_metadata(self) -> DocumentMetadataDTO:
        """Extract metadata from the current document.

        Returns:
            Document metadata DTO populated from the core properties.

        Raises:
            InvalidDocumentError: If no document is loaded.
        """
        core_props = self.document.core_properties

        return DocumentMetadataDTO(
            author=core_props.author,
            title=core_props.title,
            subject=core_props.subject,
            keywords=core_props.keywords,
            comments=core_props.comments,
            category=core_props.category,
            created=core_props.created,
            modified=core_props.modified,
            last_modified_by=core_props.last_modified_by,
            revision=core_props.revision,
        )

    def set_metadata(
        self,
        author: str | None = None,
        title: str | None = None,
        subject: str | None = None,
        keywords: str | None = None,
        comments: str | None = None,
        category: str | None = None,
    ) -> None:
        """Set document metadata.

        Only the arguments that are not None are written; all other
        properties keep their current values.

        Args:
            author: Document author.
            title: Document title.
            subject: Document subject.
            keywords: Document keywords.
            comments: Document comments.
            category: Document category.

        Raises:
            InvalidDocumentError: If no document is loaded.
        """
        core_props = self.document.core_properties

        if author is not None:
            core_props.author = author
        if title is not None:
            core_props.title = title
        if subject is not None:
            core_props.subject = subject
        if keywords is not None:
            core_props.keywords = keywords
        if comments is not None:
            core_props.comments = comments
        if category is not None:
            core_props.category = category

    def get_paragraph_count(self) -> int:
        """Get the number of paragraphs in the document.

        Returns:
            Number of paragraphs.

        Raises:
            InvalidDocumentError: If no document is loaded.
        """
        return len(self.document.paragraphs)

    def get_table_count(self) -> int:
        """Get the number of tables in the document.

        Returns:
            Number of tables.

        Raises:
            InvalidDocumentError: If no document is loaded.
        """
        return len(self.document.tables)

    def get_section_count(self) -> int:
        """Get the number of sections in the document.

        Returns:
            Number of sections.

        Raises:
            InvalidDocumentError: If no document is loaded.
        """
        return len(self.document.sections)

    def get_document_structure(self) -> dict[str, Any]:
        """Get a summary of the document structure.

        Returns:
            Dictionary with the number of paragraphs, tables, sections,
            styles and inline shapes.

        Raises:
            InvalidDocumentError: If no document is loaded.
        """
        doc = self.document

        return {
            "paragraphs": len(doc.paragraphs),
            "tables": len(doc.tables),
            "sections": len(doc.sections),
            "styles": len(doc.styles),
            "inline_shapes": len(doc.inline_shapes),
        }

    def get_all_text(self) -> str:
        """Get all text content from the document.

        Returns:
            Text of every entry in ``document.paragraphs``, joined with
            newlines.

        Raises:
            InvalidDocumentError: If no document is loaded.
        """
        return "\n".join(para.text for para in self.document.paragraphs)

    def get_word_count(self) -> int:
        """Get the word count of the document.

        Words are the whitespace-separated tokens of ``get_all_text()``.

        Returns:
            Number of words in the document.

        Raises:
            InvalidDocumentError: If no document is loaded.
        """
        return len(self.get_all_text().split())

    def get_character_count(self, include_spaces: bool = True) -> int:
        """Get the character count of the document.

        The count is taken over ``get_all_text()``, so with
        ``include_spaces=True`` the newlines that join paragraphs are
        counted too.

        Args:
            include_spaces: Whether to include spaces in the count. When
                False, space and newline characters are removed before
                counting; other whitespace is still counted.

        Returns:
            Number of characters in the document.

        Raises:
            InvalidDocumentError: If no document is loaded.
        """
        text = self.get_all_text()
        if include_spaces:
            return len(text)
        return len(text.replace(" ", "").replace("\n", ""))

    def close(self) -> None:
        """Close the current document and release resources."""
        self._document = None
        self._file_path = None

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Fu-Jie/MCP-OPENAPI-DOCX'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.