"""
Google Docs to Markdown Converter
Converts Google Docs API JSON responses to clean Markdown, preserving:
- Headings (H1-H6, Title, Subtitle)
- Bold, italic, strikethrough, code, links
- Ordered and unordered lists with nesting
- Tables with header row separators
"""
from __future__ import annotations

import html
import logging
from typing import Any
logger = logging.getLogger(__name__)
# Font families whose text runs are rendered as inline code.
MONO_FONTS = {
    "Courier New",
    "Consolas",
    "Roboto Mono",
    "Source Code Pro",
}

# Named paragraph style -> Markdown heading prefix. TITLE and SUBTITLE share
# the prefixes of HEADING_1 and HEADING_2; HEADING_n maps to n '#' characters.
HEADING_MAP = {
    "TITLE": "#",
    "SUBTITLE": "##",
    **{f"HEADING_{level}": "#" * level for level in range(1, 7)},
}
def convert_doc_to_markdown(doc: dict[str, Any]) -> str:
    """Convert a Google Docs API document response to markdown.

    Walks ``doc["body"]["content"]`` in order. Paragraphs become list items
    (when they carry a ``bullet``), headings (when their named style is in
    ``HEADING_MAP``) or plain paragraphs; tables are rendered through
    ``_convert_table``. Any other structural element is skipped.

    Args:
        doc: The document JSON from docs.documents.get()

    Returns:
        Markdown string terminated by exactly one newline.
    """
    content = doc.get("body", {}).get("content", [])
    lists_meta = doc.get("lists", {})
    lines: list[str] = []
    # Running item number for each (list id, nesting level) of ordered lists.
    ordered_counters: dict[tuple[str, int], int] = {}
    prev_was_list = False

    for element in content:
        if "paragraph" in element:
            para = element["paragraph"]
            text = _convert_paragraph_text(para)
            if not text.strip():
                # Empty paragraphs emit nothing. prev_was_list is left alone
                # on purpose: the next non-list block must still insert the
                # blank line that terminates the list, otherwise Markdown
                # folds that block into the last list item.
                continue

            bullet = para.get("bullet")
            if bullet:
                list_id = bullet["listId"]
                nesting = bullet.get("nestingLevel", 0)

                # An item at this level closes every deeper sub-list of the
                # same list, so their numbering has to restart at 1.
                stale_keys = [
                    key
                    for key in ordered_counters
                    if key[0] == list_id and key[1] > nesting
                ]
                for key in stale_keys:
                    del ordered_counters[key]

                # Four spaces per level nests correctly under both "- " and
                # "N. " markers; a single space would leave the item at the
                # parent's level.
                indent = "    " * nesting
                if _is_ordered_list(lists_meta, list_id, nesting):
                    key = (list_id, nesting)
                    counter = ordered_counters.get(key, 0) + 1
                    ordered_counters[key] = counter
                    lines.append(f"{indent}{counter}. {text}")
                else:
                    lines.append(f"{indent}- {text}")
                prev_was_list = True
                continue

            if prev_was_list:
                # Leaving a list: reset numbering and separate with a blank line.
                ordered_counters.clear()
                lines.append("")
                prev_was_list = False

            named_style = para.get("paragraphStyle", {}).get(
                "namedStyleType", "NORMAL_TEXT"
            )
            prefix = HEADING_MAP.get(named_style, "")
            lines.append(f"{prefix} {text}" if prefix else text)
            lines.append("")
        elif "table" in element:
            if prev_was_list:
                ordered_counters.clear()
                lines.append("")
                prev_was_list = False
            lines.append(_convert_table(element["table"]))
            lines.append("")

    # Collapse any trailing blank lines into a single final newline.
    return "\n".join(lines).rstrip("\n") + "\n"
def _convert_paragraph_text(para: dict[str, Any]) -> str:
"""Convert paragraph elements to inline markdown text."""
parts: list[str] = []
for elem in para.get("elements", []):
if "textRun" in elem:
parts.append(_convert_text_run(elem["textRun"]))
return "".join(parts).strip()
def _convert_text_run(text_run: dict[str, Any]) -> str:
"""Convert a single text run to markdown."""
content = text_run.get("content", "")
style = text_run.get("textStyle", {})
text = content.rstrip("\n")
if not text:
return ""
return _apply_text_style(text, style)
def _apply_text_style(text: str, style: dict[str, Any]) -> str:
"""Apply markdown formatting based on text style."""
link = style.get("link", {})
url = link.get("url")
font_family = style.get("weightedFontFamily", {}).get("fontFamily", "")
if font_family in MONO_FONTS:
return f"`{text}`"
bold = style.get("bold", False)
italic = style.get("italic", False)
strikethrough = style.get("strikethrough", False)
if bold and italic:
text = f"***{text}***"
elif bold:
text = f"**{text}**"
elif italic:
text = f"*{text}*"
if strikethrough:
text = f"~~{text}~~"
if url:
text = f"[{text}]({url})"
return text
def _is_ordered_list(lists_meta: dict[str, Any], list_id: str, nesting: int) -> bool:
"""Check if a list at a given nesting level is ordered."""
list_info = lists_meta.get(list_id, {})
nesting_levels = list_info.get("listProperties", {}).get("nestingLevels", [])
if nesting < len(nesting_levels):
level = nesting_levels[nesting]
glyph = level.get("glyphType", "")
return glyph not in ("", "GLYPH_TYPE_UNSPECIFIED")
return False
def _convert_table(table: dict[str, Any]) -> str:
"""Convert a table element to markdown."""
rows = table.get("tableRows", [])
if not rows:
return ""
md_rows: list[str] = []
for i, row in enumerate(rows):
cells: list[str] = []
for cell in row.get("tableCells", []):
cell_text = _extract_cell_text(cell)
cells.append(cell_text)
md_rows.append("| " + " | ".join(cells) + " |")
if i == 0:
sep = "| " + " | ".join("---" for _ in cells) + " |"
md_rows.append(sep)
return "\n".join(md_rows)
def _extract_cell_text(cell: dict[str, Any]) -> str:
"""Extract text from a table cell."""
parts: list[str] = []
for content_elem in cell.get("content", []):
if "paragraph" in content_elem:
text = _convert_paragraph_text(content_elem["paragraph"])
if text.strip():
parts.append(text.strip())
cell_text = " ".join(parts)
return cell_text.replace("|", "\\|")
def format_comments_inline(markdown: str, comments: list[dict[str, Any]]) -> str:
    """Attach comments to markdown as footnotes anchored on their quoted text.

    Each comment is numbered by its 1-based position in ``comments``. When
    its ``anchor_text`` occurs in the markdown, a ``[^cN]`` reference is
    inserted right after the first occurrence and a footnote definition is
    collected. Comments with no anchor, or whose anchor is not found, are
    passed to ``format_comments_appendix`` and appended after the footnotes.
    """
    if not comments:
        return markdown

    footnotes: list[str] = []
    unmatched: list[dict[str, Any]] = []

    for number, comment in enumerate(comments, start=1):
        anchor = comment.get("anchor_text", "")
        position = markdown.find(anchor) if anchor else -1
        if position < 0:
            unmatched.append(comment)
            continue
        # Insert the reference immediately after the first occurrence.
        cut = position + len(anchor)
        markdown = f"{markdown[:cut]}[^c{number}]{markdown[cut:]}"
        footnotes.append(_format_footnote(number, comment))

    if footnotes:
        footnote_block = "\n".join(footnotes)
        markdown = markdown.rstrip("\n") + "\n\n" + footnote_block + "\n"

    if unmatched:
        appendix = format_comments_appendix(unmatched)
        if appendix.strip():
            markdown = markdown.rstrip("\n") + "\n\n" + appendix

    return markdown
def _format_footnote(num: int, comment: dict[str, Any]) -> str:
"""Format a single footnote."""
lines = [f"[^c{num}]: **{comment['author']}**: {comment['content']}"]
for reply in comment.get("replies", []):
lines.append(f" - **{reply['author']}**: {reply['content']}")
return "\n".join(lines)
def format_comments_appendix(comments: list[dict[str, Any]]) -> str:
    """Format comments as an appendix section with blockquoted anchors.

    The section starts with a ``## Comments`` heading. Each comment is
    rendered as its anchor text in a blockquote (when it has one), followed
    by a bullet with the author and content, with replies as nested bullets.

    Args:
        comments: Comment dicts; ``author`` defaults to "Unknown" and
            ``content`` to an empty string when absent.

    Returns:
        The appendix markdown, or an empty string when there are no comments.
    """
    if not comments:
        return ""

    lines = ["## Comments", ""]
    for comment in comments:
        resolved_tag = " *(Resolved)*" if comment.get("resolved") else ""
        anchor = comment.get("anchor_text", "")
        if anchor:
            # Prefix every line so a multi-line anchor stays in the quote;
            # rstrip turns an empty line into a bare ">".
            lines.extend(f"> {part}".rstrip() for part in anchor.splitlines())
            lines.append("")
        author = comment.get("author", "Unknown")
        content = comment.get("content", "")
        lines.append(f"- **{author}**: {content}{resolved_tag}")
        for reply in comment.get("replies", []):
            reply_author = reply.get("author", "Unknown")
            reply_content = reply.get("content", "")
            # Four spaces nest the reply under the comment's bullet.
            lines.append(f"    - **{reply_author}**: {reply_content}")
        lines.append("")
    return "\n".join(lines)
def parse_drive_comments(
    response: dict[str, Any], include_resolved: bool = False
) -> list[dict[str, Any]]:
    """Parse Drive API comments response into structured dicts.

    Args:
        response: Raw JSON from drive.comments.list()
        include_resolved: Whether to include resolved comments

    Returns:
        List of comment dicts with keys: author, content, anchor_text,
        replies, resolved. Missing author names default to "Unknown" and
        missing text fields to an empty string.
    """
    results = []
    for comment in response.get("comments", []):
        resolved = comment.get("resolved", False)
        if resolved and not include_resolved:
            continue

        quoted = comment.get("quotedFileContent", {})
        anchor_text = quoted.get("value", "")
        if quoted.get("mimeType") == "text/html":
            # Quoted content flagged as HTML carries character entities
            # (e.g. "&#39;"); decode them so the anchor can be matched
            # against the plain text of the converted document.
            anchor_text = html.unescape(anchor_text)

        replies = [
            {
                "author": reply.get("author", {}).get("displayName", "Unknown"),
                "content": reply.get("content", ""),
            }
            for reply in comment.get("replies", [])
        ]
        results.append(
            {
                "author": comment.get("author", {}).get("displayName", "Unknown"),
                "content": comment.get("content", ""),
                "anchor_text": anchor_text,
                "replies": replies,
                "resolved": resolved,
            }
        )
    return results