google-workspace-unlimited

Overview Schema Related Servers Score Discussions

line_parser.py•6.65 KiB

""" Document line parsing utilities for Google Docs. Handles parsing of Google Docs structure to identify line positions and character indices for targeted editing operations. """ import logging from typing import Dict, List, Optional logger = logging.getLogger(__name__) class DocumentLine: """Represents a single line in a Google Doc with its position metadata.""" def __init__( self, line_number: int, start_index: int, end_index: int, content: str ): self.line_number = line_number self.start_index = start_index self.end_index = end_index self.content = content def __repr__(self): return f"DocumentLine(line={self.line_number}, start={self.start_index}, end={self.end_index}, content_len={len(self.content)})" def parse_document_lines(doc_data: Dict) -> List[DocumentLine]: """ Parse a Google Doc's structure to extract line-by-line positions. Google Docs API returns a hierarchical structure with paragraphs and text runs. This function flattens it into a line-based view with character indices. 
Args: doc_data: The document data from docs_service.documents().get() Returns: List[DocumentLine]: List of lines with their positions and content """ lines: List[DocumentLine] = [] body_content = doc_data.get("body", {}).get("content", []) current_line_number = 1 for element in body_content: if "paragraph" in element: paragraph = element.get("paragraph", {}) para_elements = paragraph.get("elements", []) # Collect all text from this paragraph paragraph_text = "" start_index = element.get("startIndex", 0) end_index = element.get("endIndex", 0) for pe in para_elements: text_run = pe.get("textRun", {}) if text_run and "content" in text_run: paragraph_text += text_run["content"] # Split paragraph into lines if it contains newlines if paragraph_text: # Each paragraph in Google Docs ends with \n, so we split carefully text_lines = paragraph_text.split("\n") # Calculate approximate character positions for each line current_pos = start_index for i, line_text in enumerate(text_lines): if ( line_text or i < len(text_lines) - 1 ): # Include empty lines except trailing line_end = current_pos + len(line_text) + 1 # +1 for newline lines.append( DocumentLine( line_number=current_line_number, start_index=current_pos, end_index=line_end, content=line_text, ) ) current_line_number += 1 current_pos = line_end logger.info(f"[parse_document_lines] Parsed {len(lines)} lines from document") return lines def find_line_position(lines: List[DocumentLine], target_line: int) -> Optional[int]: """ Find the character index for a specific line number. 
Args: lines: List of DocumentLine objects from parse_document_lines target_line: The line number to find (1-based) Returns: Optional[int]: The character index where the line starts, or None if not found """ for line in lines: if line.line_number == target_line: return line.start_index logger.warning(f"[find_line_position] Line {target_line} not found in document") return None def get_document_end_index(doc_data: Dict) -> int: """ Get the end index of the document (for appending content). Args: doc_data: The document data from docs_service.documents().get() Returns: int: The character index at the end of the document """ body_content = doc_data.get("body", {}).get("content", []) end_index = 1 # Minimum is 1 (Google Docs always has at least one character) for element in body_content: if "endIndex" in element: end_index = max(end_index, element["endIndex"]) return end_index def extract_document_text(doc_data: Dict) -> str: """ Extract all text content from a Google Doc as a continuous string. 
This function comprehensively extracts text from all content types: - Paragraphs - Tables (including nested cells) - Lists - Any other structural elements Args: doc_data: The document data from docs_service.documents().get() Returns: str: The complete document text as a single string """ def extract_from_element(element: Dict) -> str: """Recursively extract text from a document element.""" text_content = "" # Handle paragraph elements if "paragraph" in element: paragraph = element.get("paragraph", {}) para_elements = paragraph.get("elements", []) for pe in para_elements: text_run = pe.get("textRun", {}) if text_run and "content" in text_run: text_content += text_run["content"] # Handle table elements elif "table" in element: table = element.get("table", {}) table_rows = table.get("tableRows", []) for row in table_rows: table_cells = row.get("tableCells", []) for cell in table_cells: cell_content = cell.get("content", []) for cell_element in cell_content: # Recursively extract text from cell content text_content += extract_from_element(cell_element) # Handle section breaks and other structural elements # (They typically don't contain text, but we check for completeness) elif "sectionBreak" in element: # Section breaks don't contain text pass # Handle table of contents elif "tableOfContents" in element: toc = element.get("tableOfContents", {}) toc_content = toc.get("content", []) for toc_element in toc_content: text_content += extract_from_element(toc_element) return text_content # Extract text from body content body_content = doc_data.get("body", {}).get("content", []) full_text = "" for element in body_content: full_text += extract_from_element(element) logger.info( f"[extract_document_text] Extracted {len(full_text)} characters from document" ) return full_text

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/dipseth/google-workspace-unlimited'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

line_parser.py•6.65 KiB