docs_structure.py•12.1 kB
"""
Google Docs Document Structure Parsing and Analysis
This module provides utilities for parsing and analyzing the structure
of Google Docs documents, including finding tables, cells, and other elements.
"""
import logging
from typing import Any, Optional
logger = logging.getLogger(__name__)
def parse_document_structure(doc_data: dict[str, Any]) -> dict[str, Any]:
"""
Parse the full document structure into a navigable format.
Args:
doc_data: Raw document data from Google Docs API
Returns:
Dictionary containing parsed structure with elements and their positions
"""
structure = {
'title': doc_data.get('title', ''),
'body': [],
'tables': [],
'headers': {},
'footers': {},
'total_length': 0
}
body = doc_data.get('body', {})
content = body.get('content', [])
for element in content:
element_info = _parse_element(element)
if element_info:
structure['body'].append(element_info)
if element_info['type'] == 'table':
structure['tables'].append(element_info)
# Calculate total document length
if structure['body']:
last_element = structure['body'][-1]
structure['total_length'] = last_element.get('end_index', 0)
# Parse headers and footers
for header_id, header_data in doc_data.get('headers', {}).items():
structure['headers'][header_id] = _parse_segment(header_data)
for footer_id, footer_data in doc_data.get('footers', {}).items():
structure['footers'][footer_id] = _parse_segment(footer_data)
return structure
def _parse_element(element: dict[str, Any]) -> Optional[dict[str, Any]]:
"""
Parse a single document element.
Args:
element: Element data from document
Returns:
Parsed element information or None
"""
element_info = {
'start_index': element.get('startIndex', 0),
'end_index': element.get('endIndex', 0)
}
if 'paragraph' in element:
paragraph = element['paragraph']
element_info['type'] = 'paragraph'
element_info['text'] = _extract_paragraph_text(paragraph)
element_info['style'] = paragraph.get('paragraphStyle', {})
elif 'table' in element:
table = element['table']
element_info['type'] = 'table'
element_info['rows'] = len(table.get('tableRows', []))
element_info['columns'] = len(table.get('tableRows', [{}])[0].get('tableCells', []))
element_info['cells'] = _parse_table_cells(table)
element_info['table_style'] = table.get('tableStyle', {})
elif 'sectionBreak' in element:
element_info['type'] = 'section_break'
element_info['section_style'] = element['sectionBreak'].get('sectionStyle', {})
elif 'tableOfContents' in element:
element_info['type'] = 'table_of_contents'
else:
return None
return element_info
def _parse_table_cells(table: dict[str, Any]) -> list[list[dict[str, Any]]]:
"""
Parse table cells with their positions and content.
Args:
table: Table element data
Returns:
2D list of cell information
"""
cells = []
for row_idx, row in enumerate(table.get('tableRows', [])):
row_cells = []
for col_idx, cell in enumerate(row.get('tableCells', [])):
# Find the first paragraph in the cell for insertion
insertion_index = cell.get('startIndex', 0) + 1 # Default fallback
# Look for the first paragraph in cell content
content_elements = cell.get('content', [])
for element in content_elements:
if 'paragraph' in element:
paragraph = element['paragraph']
# Get the first element in the paragraph
para_elements = paragraph.get('elements', [])
if para_elements:
first_element = para_elements[0]
if 'startIndex' in first_element:
insertion_index = first_element['startIndex']
break
cell_info = {
'row': row_idx,
'column': col_idx,
'start_index': cell.get('startIndex', 0),
'end_index': cell.get('endIndex', 0),
'insertion_index': insertion_index, # Where to insert text in this cell
'content': _extract_cell_text(cell),
'content_elements': content_elements
}
row_cells.append(cell_info)
cells.append(row_cells)
return cells
def _extract_paragraph_text(paragraph: dict[str, Any]) -> str:
"""Extract text from a paragraph element."""
text_parts = []
for element in paragraph.get('elements', []):
if 'textRun' in element:
text_parts.append(element['textRun'].get('content', ''))
return ''.join(text_parts)
def _extract_cell_text(cell: dict[str, Any]) -> str:
"""Extract text content from a table cell."""
text_parts = []
for element in cell.get('content', []):
if 'paragraph' in element:
text_parts.append(_extract_paragraph_text(element['paragraph']))
return ''.join(text_parts)
def _parse_segment(segment_data: dict[str, Any]) -> dict[str, Any]:
"""Parse a document segment (header/footer)."""
return {
'content': segment_data.get('content', []),
'start_index': segment_data.get('content', [{}])[0].get('startIndex', 0) if segment_data.get('content') else 0,
'end_index': segment_data.get('content', [{}])[-1].get('endIndex', 0) if segment_data.get('content') else 0
}
def find_tables(doc_data: dict[str, Any]) -> list[dict[str, Any]]:
"""
Find all tables in the document with their positions and dimensions.
Args:
doc_data: Raw document data from Google Docs API
Returns:
List of table information dictionaries
"""
tables = []
structure = parse_document_structure(doc_data)
for idx, table_info in enumerate(structure['tables']):
tables.append({
'index': idx,
'start_index': table_info['start_index'],
'end_index': table_info['end_index'],
'rows': table_info['rows'],
'columns': table_info['columns'],
'cells': table_info['cells']
})
return tables
def get_table_cell_indices(doc_data: dict[str, Any], table_index: int = 0) -> Optional[list[list[tuple[int, int]]]]:
"""
Get content indices for all cells in a specific table.
Args:
doc_data: Raw document data from Google Docs API
table_index: Index of the table (0-based)
Returns:
2D list of (start_index, end_index) tuples for each cell, or None if table not found
"""
tables = find_tables(doc_data)
if table_index >= len(tables):
logger.warning(f"Table index {table_index} not found. Document has {len(tables)} tables.")
return None
table = tables[table_index]
cell_indices = []
for row in table['cells']:
row_indices = []
for cell in row:
# Each cell contains at least one paragraph
# Find the first paragraph in the cell for content insertion
cell_content = cell.get('content_elements', [])
if cell_content:
# Look for the first paragraph in cell content
first_para = None
for element in cell_content:
if 'paragraph' in element:
first_para = element['paragraph']
break
if first_para and 'elements' in first_para and first_para['elements']:
# Insert at the start of the first text run in the paragraph
first_text_element = first_para['elements'][0]
if 'textRun' in first_text_element:
start_idx = first_text_element.get('startIndex', cell['start_index'] + 1)
end_idx = first_text_element.get('endIndex', start_idx + 1)
row_indices.append((start_idx, end_idx))
continue
# Fallback: use cell boundaries with safe margins
content_start = cell['start_index'] + 1
content_end = cell['end_index'] - 1
row_indices.append((content_start, content_end))
cell_indices.append(row_indices)
return cell_indices
def find_element_at_index(doc_data: dict[str, Any], index: int) -> Optional[dict[str, Any]]:
"""
Find what element exists at a given index in the document.
Args:
doc_data: Raw document data from Google Docs API
index: Position in the document
Returns:
Information about the element at that position, or None
"""
structure = parse_document_structure(doc_data)
for element in structure['body']:
if element['start_index'] <= index < element['end_index']:
element_copy = element.copy()
# If it's a table, find which cell contains the index
if element['type'] == 'table' and 'cells' in element:
for row_idx, row in enumerate(element['cells']):
for col_idx, cell in enumerate(row):
if cell['start_index'] <= index < cell['end_index']:
element_copy['containing_cell'] = {
'row': row_idx,
'column': col_idx,
'cell_start': cell['start_index'],
'cell_end': cell['end_index']
}
break
return element_copy
return None
def get_next_paragraph_index(doc_data: dict[str, Any], after_index: int = 0) -> int:
"""
Find the next safe position to insert content after a given index.
Args:
doc_data: Raw document data from Google Docs API
after_index: Index after which to find insertion point
Returns:
Safe index for insertion
"""
structure = parse_document_structure(doc_data)
# Find the first paragraph element after the given index
for element in structure['body']:
if element['type'] == 'paragraph' and element['start_index'] > after_index:
# Insert at the end of the previous element or start of this paragraph
return element['start_index']
# If no paragraph found, return the end of document
return structure['total_length'] - 1 if structure['total_length'] > 0 else 1
def analyze_document_complexity(doc_data: dict[str, Any]) -> dict[str, Any]:
"""
Analyze document complexity and provide statistics.
Args:
doc_data: Raw document data from Google Docs API
Returns:
Dictionary with document statistics
"""
structure = parse_document_structure(doc_data)
stats = {
'total_elements': len(structure['body']),
'tables': len(structure['tables']),
'paragraphs': sum(1 for e in structure['body'] if e.get('type') == 'paragraph'),
'section_breaks': sum(1 for e in structure['body'] if e.get('type') == 'section_break'),
'total_length': structure['total_length'],
'has_headers': bool(structure['headers']),
'has_footers': bool(structure['footers'])
}
# Add table statistics
if structure['tables']:
total_cells = sum(
table['rows'] * table['columns']
for table in structure['tables']
)
stats['total_table_cells'] = total_cells
stats['largest_table'] = max(
(t['rows'] * t['columns'] for t in structure['tables']),
default=0
)
return stats