Skip to main content
Glama
table_analysis.py • 14.2 kB
"""Data models for comprehensive table structure and style analysis."""

import logging
from dataclasses import dataclass
from typing import List, Optional, Dict, Any, Tuple
from enum import Enum

logger = logging.getLogger(__name__)

# WordprocessingML main namespace; every element/attribute looked up below
# lives in it.
_W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"

# Paragraph alignment values mapped to the names used in the analysis output.
# NOTE(review): assumes the alignment value hashes/compares like these ints
# (python-docx WD_ALIGN_PARAGRAPH members do) -- confirm for other callers.
_ALIGNMENT_NAMES = {0: "left", 1: "center", 2: "right", 3: "justify"}

_BORDER_SIDES = ("top", "bottom", "left", "right")


def _w(tag: str) -> str:
    """Return *tag* qualified with the WordprocessingML namespace (Clark notation)."""
    return f"{{{_W_NS}}}{tag}"


class CellMergeType(Enum):
    """Types of cell merging."""

    NONE = "none"
    HORIZONTAL = "horizontal"  # Cell spans multiple columns
    VERTICAL = "vertical"  # Cell spans multiple rows
    BOTH = "both"  # Cell spans both rows and columns


@dataclass
class MergeInfo:
    """Information about cell merging."""

    merge_type: CellMergeType
    start_row: int
    end_row: int
    start_col: int
    end_col: int
    span_rows: int
    span_cols: int

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary representation (enum flattened to its value)."""
        return {
            "type": self.merge_type.value,
            "start_row": self.start_row,
            "end_row": self.end_row,
            "start_col": self.start_col,
            "end_col": self.end_col,
            "span_rows": self.span_rows,
            "span_cols": self.span_cols,
        }


@dataclass
class CellStyleAnalysis:
    """Comprehensive analysis of a single cell's styling."""

    # Position information
    row_index: int
    column_index: int

    # Content
    text_content: str
    is_empty: bool

    # Merge information
    merge_info: Optional[MergeInfo]

    # Text formatting
    font_family: Optional[str]
    font_size: Optional[int]
    font_color: Optional[str]
    is_bold: bool
    is_italic: bool
    is_underlined: bool
    is_strikethrough: bool

    # Alignment
    horizontal_alignment: Optional[str]  # left, center, right, justify
    vertical_alignment: Optional[str]  # top, middle, bottom

    # Background and borders
    background_color: Optional[str]

    # Border information for each side
    top_border: Optional[Dict[str, str]]  # style, width, color
    bottom_border: Optional[Dict[str, str]]
    left_border: Optional[Dict[str, str]]
    right_border: Optional[Dict[str, str]]

    # Cell dimensions (if available)
    width: Optional[float]  # In points or inches
    height: Optional[float]

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary representation.

        ``"merge"`` is always present (``None`` for unmerged cells);
        ``"dimensions"`` is only emitted when a width or height is known.
        """
        result: Dict[str, Any] = {
            "position": {
                "row": self.row_index,
                "column": self.column_index,
            },
            "content": {
                "text": self.text_content,
                "is_empty": self.is_empty,
            },
            "text_format": {
                "font_family": self.font_family,
                "font_size": self.font_size,
                "font_color": self.font_color,
                "bold": self.is_bold,
                "italic": self.is_italic,
                "underlined": self.is_underlined,
                "strikethrough": self.is_strikethrough,
            },
            "alignment": {
                "horizontal": self.horizontal_alignment,
                "vertical": self.vertical_alignment,
            },
            "background": {
                "color": self.background_color,
            },
            "borders": {
                "top": self.top_border,
                "bottom": self.bottom_border,
                "left": self.left_border,
                "right": self.right_border,
            },
        }

        result["merge"] = self.merge_info.to_dict() if self.merge_info else None

        if self.width is not None or self.height is not None:
            result["dimensions"] = {
                "width": self.width,
                "height": self.height,
            }

        return result


@dataclass
class TableStructureAnalysis:
    """Comprehensive analysis of table structure and styling."""

    # Basic table information
    table_index: int
    total_rows: int
    total_columns: int

    # Table-level properties
    table_style_name: Optional[str]
    table_alignment: Optional[str]  # left, center, right
    table_width: Optional[float]

    # Header information
    has_header_row: bool
    header_row_index: Optional[int]
    header_cells: Optional[List[str]]

    # All cell analyses
    cells: List[List[CellStyleAnalysis]]  # [row][column]

    # Merge summary
    merged_cells_count: int
    merge_regions: List[MergeInfo]

    # Style consistency analysis
    consistent_fonts: bool
    consistent_alignment: bool
    consistent_borders: bool

    # Common styles found
    unique_font_families: List[str]
    unique_font_sizes: List[int]
    unique_colors: List[str]
    unique_background_colors: List[str]

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary representation."""
        return {
            "table_info": {
                "index": self.table_index,
                "rows": self.total_rows,
                "columns": self.total_columns,
                "style_name": self.table_style_name,
                "alignment": self.table_alignment,
                "width": self.table_width,
            },
            "header_info": {
                "has_header": self.has_header_row,
                "header_row_index": self.header_row_index,
                "header_cells": self.header_cells,
            },
            "cells": [[cell.to_dict() for cell in row] for row in self.cells],
            "merge_analysis": {
                "merged_cells_count": self.merged_cells_count,
                "merge_regions": [merge.to_dict() for merge in self.merge_regions],
            },
            "style_consistency": {
                "fonts": self.consistent_fonts,
                "alignment": self.consistent_alignment,
                "borders": self.consistent_borders,
            },
            "style_summary": {
                "font_families": self.unique_font_families,
                "font_sizes": self.unique_font_sizes,
                "colors": self.unique_colors,
                "background_colors": self.unique_background_colors,
            },
        }


@dataclass
class TableAnalysisResult:
    """Result of comprehensive table analysis."""

    file_path: str
    total_tables: int
    analysis_timestamp: str
    tables: List[TableStructureAnalysis]

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary representation."""
        return {
            "file_info": {
                "path": self.file_path,
                "total_tables": self.total_tables,
                "analysis_timestamp": self.analysis_timestamp,
            },
            "tables": [table.to_dict() for table in self.tables],
        }


# Helper functions for analysis


def analyze_cell_merge(cell, row_idx: int, col_idx: int) -> Optional[MergeInfo]:
    """Analyze if a cell is part of a merge and return merge information.

    Args:
        cell: Object exposing the cell's ``<w:tc>`` XML element as
            ``_element`` (e.g. a python-docx table cell). Objects without
            that attribute are treated as unmerged.
        row_idx: Row index recorded as the start of the merge region.
        col_idx: Column index recorded as the start of the merge region.

    Returns:
        A ``MergeInfo`` when the cell's own ``<w:tcPr>`` carries a
        ``<w:gridSpan>`` greater than 1 and/or a ``<w:vMerge>``; otherwise
        ``None``. Any error while reading the XML also yields ``None``
        (best effort).

    Note:
        ``span_rows`` is always 1: computing the real vertical extent needs
        the cells below this one, which are not available here. Callers that
        need full regions must stitch "restart"/"continue" cells themselves.
    """
    try:
        tc_element = getattr(cell, "_element", None)
        if tc_element is None:
            return None

        # Direct-child lookup only: a descendant search could pick up the
        # tcPr of a table nested inside this cell.
        tc_pr = tc_element.find(_w("tcPr"))
        if tc_pr is None:
            return None

        # gridSpan is a child element of tcPr whose w:val holds the number of
        # grid columns spanned -- it is not an attribute of <w:tc>.
        span_cols = 1
        grid_span_elem = tc_pr.find(_w("gridSpan"))
        if grid_span_elem is not None:
            span_cols = int(grid_span_elem.get(_w("val")) or 1)

        # A vMerge element without w:val means "continue"; only the first
        # cell of a vertical merge is marked "restart".
        v_merge = None
        v_merge_elem = tc_pr.find(_w("vMerge"))
        if v_merge_elem is not None:
            v_merge = v_merge_elem.get(_w("val")) or "continue"

        is_horizontal = span_cols > 1
        is_vertical = v_merge is not None
        if not (is_horizontal or is_vertical):
            return None

        if is_horizontal and is_vertical:
            merge_type = CellMergeType.BOTH
        elif is_horizontal:
            merge_type = CellMergeType.HORIZONTAL
        else:
            merge_type = CellMergeType.VERTICAL

        span_rows = 1  # see Note in the docstring
        return MergeInfo(
            merge_type=merge_type,
            start_row=row_idx,
            end_row=row_idx + span_rows - 1,
            start_col=col_idx,
            end_col=col_idx + span_cols - 1,
            span_rows=span_rows,
            span_cols=span_cols,
        )
    except Exception:
        # Deliberately best effort: a malformed cell must not abort the whole
        # table analysis, but leave a trace for debugging.
        logger.debug(
            "Could not analyze merge for cell (%s, %s)",
            row_idx,
            col_idx,
            exc_info=True,
        )

    return None


def extract_cell_formatting(cell) -> Dict[str, Any]:
    """Extract comprehensive formatting information from a cell.

    Text properties are sampled from the first run of the first paragraph
    only; cell-level properties (shading, vertical alignment, borders) are
    read from the cell's own ``<w:tcPr>`` element.

    Args:
        cell: Object exposing ``paragraphs`` and, optionally, the underlying
            ``<w:tc>`` element as ``_element`` (e.g. a python-docx cell).

    Returns:
        A dict with the keys ``font_family``, ``font_size``, ``font_color``,
        ``is_bold``, ``is_italic``, ``is_underlined``, ``is_strikethrough``,
        ``horizontal_alignment``, ``vertical_alignment``,
        ``background_color`` and ``borders`` (a dict keyed by
        top/bottom/left/right, each ``None`` or a dict with ``style``,
        ``width`` and ``color``). If an error occurs part-way through, the
        values extracted so far are returned and the rest keep their
        defaults.
    """
    formatting: Dict[str, Any] = {
        "font_family": None,
        "font_size": None,
        "font_color": None,
        "is_bold": False,
        "is_italic": False,
        "is_underlined": False,
        "is_strikethrough": False,
        "horizontal_alignment": None,
        "vertical_alignment": None,
        "background_color": None,
        "borders": {side: None for side in _BORDER_SIDES},
    }

    try:
        # Get the first paragraph and run for text formatting
        paragraphs = cell.paragraphs
        if paragraphs:
            paragraph = paragraphs[0]

            # Paragraph alignment
            if paragraph.alignment is not None:
                formatting["horizontal_alignment"] = _ALIGNMENT_NAMES.get(
                    paragraph.alignment
                )

            # Run formatting (text properties)
            if paragraph.runs:
                run = paragraph.runs[0]
                font = run.font

                if font.name:
                    formatting["font_family"] = font.name
                if font.size:
                    formatting["font_size"] = font.size.pt
                if font.color and font.color.rgb:
                    formatting["font_color"] = str(font.color.rgb)

                # These properties are tri-state (None means "inherited") and
                # underline may be a non-bool style value; the output fields
                # are plain booleans, so collapse everything with bool().
                formatting["is_bold"] = bool(run.bold)
                formatting["is_italic"] = bool(run.italic)
                formatting["is_underlined"] = bool(run.underline)
                formatting["is_strikethrough"] = bool(getattr(font, "strike", None))

        # Cell-level formatting (background, borders)
        tc_element = getattr(cell, "_element", None)
        # Direct-child lookups throughout: descendant searches could return
        # properties of a nested table or of a tracked-change record instead
        # of this cell's current properties.
        tc_pr = tc_element.find(_w("tcPr")) if tc_element is not None else None

        if tc_pr is not None:
            # Background color (shading)
            shd = tc_pr.find(_w("shd"))
            if shd is not None:
                fill = shd.get(_w("fill"))
                if fill:
                    formatting["background_color"] = fill

            # Vertical alignment
            v_align = tc_pr.find(_w("vAlign"))
            if v_align is not None:
                formatting["vertical_alignment"] = v_align.get(_w("val"))

            # Borders
            tc_borders = tc_pr.find(_w("tcBorders"))
            if tc_borders is not None:
                for border_side in _BORDER_SIDES:
                    border_elem = tc_borders.find(_w(border_side))
                    if border_elem is not None:
                        formatting["borders"][border_side] = {
                            "style": border_elem.get(_w("val")),
                            "width": border_elem.get(_w("sz")),
                            "color": border_elem.get(_w("color")),
                        }

    except Exception:
        # If any error occurs, return the partial formatting extracted
        logger.debug("Could not fully extract cell formatting", exc_info=True)

    return formatting

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Rookie0x80/docx-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.