Skip to main content
Glama

Google Workspace MCP Server - Control Gmail, Calendar, Docs, Sheets, Slides, Chat, Forms & Drive

docs_structure.py (12.1 kB)
""" Google Docs Document Structure Parsing and Analysis This module provides utilities for parsing and analyzing the structure of Google Docs documents, including finding tables, cells, and other elements. """ import logging from typing import Any, Optional logger = logging.getLogger(__name__) def parse_document_structure(doc_data: dict[str, Any]) -> dict[str, Any]: """ Parse the full document structure into a navigable format. Args: doc_data: Raw document data from Google Docs API Returns: Dictionary containing parsed structure with elements and their positions """ structure = { 'title': doc_data.get('title', ''), 'body': [], 'tables': [], 'headers': {}, 'footers': {}, 'total_length': 0 } body = doc_data.get('body', {}) content = body.get('content', []) for element in content: element_info = _parse_element(element) if element_info: structure['body'].append(element_info) if element_info['type'] == 'table': structure['tables'].append(element_info) # Calculate total document length if structure['body']: last_element = structure['body'][-1] structure['total_length'] = last_element.get('end_index', 0) # Parse headers and footers for header_id, header_data in doc_data.get('headers', {}).items(): structure['headers'][header_id] = _parse_segment(header_data) for footer_id, footer_data in doc_data.get('footers', {}).items(): structure['footers'][footer_id] = _parse_segment(footer_data) return structure def _parse_element(element: dict[str, Any]) -> Optional[dict[str, Any]]: """ Parse a single document element. 
Args: element: Element data from document Returns: Parsed element information or None """ element_info = { 'start_index': element.get('startIndex', 0), 'end_index': element.get('endIndex', 0) } if 'paragraph' in element: paragraph = element['paragraph'] element_info['type'] = 'paragraph' element_info['text'] = _extract_paragraph_text(paragraph) element_info['style'] = paragraph.get('paragraphStyle', {}) elif 'table' in element: table = element['table'] element_info['type'] = 'table' element_info['rows'] = len(table.get('tableRows', [])) element_info['columns'] = len(table.get('tableRows', [{}])[0].get('tableCells', [])) element_info['cells'] = _parse_table_cells(table) element_info['table_style'] = table.get('tableStyle', {}) elif 'sectionBreak' in element: element_info['type'] = 'section_break' element_info['section_style'] = element['sectionBreak'].get('sectionStyle', {}) elif 'tableOfContents' in element: element_info['type'] = 'table_of_contents' else: return None return element_info def _parse_table_cells(table: dict[str, Any]) -> list[list[dict[str, Any]]]: """ Parse table cells with their positions and content. 
Args: table: Table element data Returns: 2D list of cell information """ cells = [] for row_idx, row in enumerate(table.get('tableRows', [])): row_cells = [] for col_idx, cell in enumerate(row.get('tableCells', [])): # Find the first paragraph in the cell for insertion insertion_index = cell.get('startIndex', 0) + 1 # Default fallback # Look for the first paragraph in cell content content_elements = cell.get('content', []) for element in content_elements: if 'paragraph' in element: paragraph = element['paragraph'] # Get the first element in the paragraph para_elements = paragraph.get('elements', []) if para_elements: first_element = para_elements[0] if 'startIndex' in first_element: insertion_index = first_element['startIndex'] break cell_info = { 'row': row_idx, 'column': col_idx, 'start_index': cell.get('startIndex', 0), 'end_index': cell.get('endIndex', 0), 'insertion_index': insertion_index, # Where to insert text in this cell 'content': _extract_cell_text(cell), 'content_elements': content_elements } row_cells.append(cell_info) cells.append(row_cells) return cells def _extract_paragraph_text(paragraph: dict[str, Any]) -> str: """Extract text from a paragraph element.""" text_parts = [] for element in paragraph.get('elements', []): if 'textRun' in element: text_parts.append(element['textRun'].get('content', '')) return ''.join(text_parts) def _extract_cell_text(cell: dict[str, Any]) -> str: """Extract text content from a table cell.""" text_parts = [] for element in cell.get('content', []): if 'paragraph' in element: text_parts.append(_extract_paragraph_text(element['paragraph'])) return ''.join(text_parts) def _parse_segment(segment_data: dict[str, Any]) -> dict[str, Any]: """Parse a document segment (header/footer).""" return { 'content': segment_data.get('content', []), 'start_index': segment_data.get('content', [{}])[0].get('startIndex', 0) if segment_data.get('content') else 0, 'end_index': segment_data.get('content', [{}])[-1].get('endIndex', 0) if 
segment_data.get('content') else 0 } def find_tables(doc_data: dict[str, Any]) -> list[dict[str, Any]]: """ Find all tables in the document with their positions and dimensions. Args: doc_data: Raw document data from Google Docs API Returns: List of table information dictionaries """ tables = [] structure = parse_document_structure(doc_data) for idx, table_info in enumerate(structure['tables']): tables.append({ 'index': idx, 'start_index': table_info['start_index'], 'end_index': table_info['end_index'], 'rows': table_info['rows'], 'columns': table_info['columns'], 'cells': table_info['cells'] }) return tables def get_table_cell_indices(doc_data: dict[str, Any], table_index: int = 0) -> Optional[list[list[tuple[int, int]]]]: """ Get content indices for all cells in a specific table. Args: doc_data: Raw document data from Google Docs API table_index: Index of the table (0-based) Returns: 2D list of (start_index, end_index) tuples for each cell, or None if table not found """ tables = find_tables(doc_data) if table_index >= len(tables): logger.warning(f"Table index {table_index} not found. 
Document has {len(tables)} tables.") return None table = tables[table_index] cell_indices = [] for row in table['cells']: row_indices = [] for cell in row: # Each cell contains at least one paragraph # Find the first paragraph in the cell for content insertion cell_content = cell.get('content_elements', []) if cell_content: # Look for the first paragraph in cell content first_para = None for element in cell_content: if 'paragraph' in element: first_para = element['paragraph'] break if first_para and 'elements' in first_para and first_para['elements']: # Insert at the start of the first text run in the paragraph first_text_element = first_para['elements'][0] if 'textRun' in first_text_element: start_idx = first_text_element.get('startIndex', cell['start_index'] + 1) end_idx = first_text_element.get('endIndex', start_idx + 1) row_indices.append((start_idx, end_idx)) continue # Fallback: use cell boundaries with safe margins content_start = cell['start_index'] + 1 content_end = cell['end_index'] - 1 row_indices.append((content_start, content_end)) cell_indices.append(row_indices) return cell_indices def find_element_at_index(doc_data: dict[str, Any], index: int) -> Optional[dict[str, Any]]: """ Find what element exists at a given index in the document. 
Args: doc_data: Raw document data from Google Docs API index: Position in the document Returns: Information about the element at that position, or None """ structure = parse_document_structure(doc_data) for element in structure['body']: if element['start_index'] <= index < element['end_index']: element_copy = element.copy() # If it's a table, find which cell contains the index if element['type'] == 'table' and 'cells' in element: for row_idx, row in enumerate(element['cells']): for col_idx, cell in enumerate(row): if cell['start_index'] <= index < cell['end_index']: element_copy['containing_cell'] = { 'row': row_idx, 'column': col_idx, 'cell_start': cell['start_index'], 'cell_end': cell['end_index'] } break return element_copy return None def get_next_paragraph_index(doc_data: dict[str, Any], after_index: int = 0) -> int: """ Find the next safe position to insert content after a given index. Args: doc_data: Raw document data from Google Docs API after_index: Index after which to find insertion point Returns: Safe index for insertion """ structure = parse_document_structure(doc_data) # Find the first paragraph element after the given index for element in structure['body']: if element['type'] == 'paragraph' and element['start_index'] > after_index: # Insert at the end of the previous element or start of this paragraph return element['start_index'] # If no paragraph found, return the end of document return structure['total_length'] - 1 if structure['total_length'] > 0 else 1 def analyze_document_complexity(doc_data: dict[str, Any]) -> dict[str, Any]: """ Analyze document complexity and provide statistics. 
Args: doc_data: Raw document data from Google Docs API Returns: Dictionary with document statistics """ structure = parse_document_structure(doc_data) stats = { 'total_elements': len(structure['body']), 'tables': len(structure['tables']), 'paragraphs': sum(1 for e in structure['body'] if e.get('type') == 'paragraph'), 'section_breaks': sum(1 for e in structure['body'] if e.get('type') == 'section_break'), 'total_length': structure['total_length'], 'has_headers': bool(structure['headers']), 'has_footers': bool(structure['footers']) } # Add table statistics if structure['tables']: total_cells = sum( table['rows'] * table['columns'] for table in structure['tables'] ) stats['total_table_cells'] = total_cells stats['largest_table'] = max( (t['rows'] * t['columns'] for t in structure['tables']), default=0 ) return stats

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/taylorwilsdon/google_workspace_mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.