"""HTML export functionality for DeepWiki documentation."""
from __future__ import annotations

import argparse
import asyncio
import json
import re
import shutil
import sys
import time
from pathlib import Path
from typing import Any, cast

import markdown

from local_deepwiki.cli_progress import create_progress, is_interactive
from local_deepwiki.export.html_template import STATIC_HTML_TEMPLATE
from local_deepwiki.export.shared import build_breadcrumb
from local_deepwiki.export.shared import extract_title as _shared_extract_title
from local_deepwiki.export.shared import render_toc, render_toc_entry
from local_deepwiki.export.streaming import (
    ExportConfig,
    ExportResult,
    ProgressCallback,
    StreamingExporter,
    WikiPage,
    WikiPageIterator,
)
from local_deepwiki.logging import get_logger
logger = get_logger(__name__)
def render_markdown(content: str) -> str:
"""Render markdown to HTML."""
md = markdown.Markdown(
extensions=[
"fenced_code",
"tables",
"toc",
"nl2br",
]
)
return md.convert(content)
def fix_internal_links(html_content: str) -> str:
"""Convert internal .md links to .html links in rendered HTML.
Args:
html_content: HTML content with potential .md links.
Returns:
HTML content with .md links converted to .html links.
"""
# Match href attributes pointing to .md files (internal links only)
# Excludes http://, https://, and other protocol links
pattern = r'href="((?!https?://|mailto:|#)[^"]*\.md)(#[^"]*)?"'
def replace_link(match: re.Match[str]) -> str:
md_path = match.group(1)
anchor = match.group(2) or ""
html_path = md_path[:-3] + ".html" # Replace .md with .html
return f'href="{html_path}{anchor}"'
return re.sub(pattern, replace_link, html_content)
def add_external_link_targets(html_content: str) -> str:
"""Add target="_blank" to external links for opening in new tab.
Args:
html_content: HTML content with potential external links.
Returns:
HTML content with external links opening in new tabs.
"""
# Match href attributes pointing to http:// or https:// URLs
# that don't already have a target attribute
pattern = r'<a\s+href="(https?://[^"]+)"(?![^>]*target=)'
def add_target(match: re.Match[str]) -> str:
url = match.group(1)
return f'<a href="{url}" target="_blank" rel="noopener noreferrer"'
return re.sub(pattern, add_target, html_content)
def extract_title(md_file: Path) -> str:
"""Extract title from markdown file.
Delegates to ``shared.extract_title``.
"""
return _shared_extract_title(md_file)
class StreamingHtmlExporter(StreamingExporter):
"""Memory-efficient HTML exporter using streaming page iteration.
Writes each page to disk as it's processed, avoiding loading all
pages into memory at once. Suitable for large wikis.
"""
def __init__(
self,
wiki_path: Path,
output_path: Path,
config: ExportConfig | None = None,
*,
no_progress: bool = False,
):
"""Initialize the streaming HTML exporter.
Args:
wiki_path: Path to the .deepwiki directory.
output_path: Output directory for HTML files.
config: Export configuration.
no_progress: If True, disable progress bars.
"""
super().__init__(wiki_path, output_path, config)
self._no_progress = no_progress
async def export(
self, progress_callback: ProgressCallback | None = None
) -> ExportResult:
"""Export wiki to HTML with streaming.
Args:
progress_callback: Optional callback for progress updates.
Returns:
ExportResult with export statistics.
"""
start_time = time.monotonic()
errors: list[str] = []
logger.info(
"Starting streaming HTML export from %s to %s",
self.wiki_path,
self.output_path,
)
# Load TOC for navigation
await asyncio.to_thread(self.load_toc)
# Create output directory
await asyncio.to_thread(self.output_path.mkdir, parents=True, exist_ok=True)
# Copy search.json
search_src = self.wiki_path / "search.json"
if search_src.exists():
await asyncio.to_thread(
shutil.copy, search_src, self.output_path / "search.json"
)
logger.debug("Copied search.json to output directory")
# Get page count for progress
iterator = self.get_page_iterator()
total_pages = iterator.get_page_count()
# Report total pages at start
if progress_callback:
progress_callback(
0, total_pages, f"Starting HTML export ({total_pages} pages)"
)
# Export pages one at a time
exported = 0
async for page in iterator:
try:
await asyncio.to_thread(self._export_wiki_page, page)
exported += 1
if progress_callback:
progress_callback(exported, total_pages, f"Exported {page.path}")
# Release content from memory after writing
page.release_content()
except Exception as e: # noqa: BLE001 — export error boundary: one page failure must not abort entire export
error_msg = f"Failed to export {page.path}: {e}"
logger.warning(error_msg)
errors.append(error_msg)
# Report completion
if progress_callback:
progress_callback(
exported, total_pages, f"HTML export complete ({exported} pages)"
)
duration_ms = int((time.monotonic() - start_time) * 1000)
logger.info(
"Streaming HTML export complete: %d pages in %dms", exported, duration_ms
)
return ExportResult(
pages_exported=exported,
output_path=self.output_path,
duration_ms=duration_ms,
errors=errors,
)
def _export_wiki_page(self, page: WikiPage) -> None:
"""Export a single wiki page to HTML.
Args:
page: WikiPage object with content loaded on demand.
"""
rel_path = page.metadata.relative_path
logger.debug("Exporting page: %s", rel_path)
# Render markdown to HTML, fix internal links, and set external link targets
html_content = render_markdown(page.content)
html_content = fix_internal_links(html_content)
html_content = add_external_link_targets(html_content)
# Calculate depth for relative paths
depth = len(rel_path.parts) - 1
root_path = "../" * depth if depth > 0 else "./"
# Build TOC HTML with correct relative paths
toc_html = self._render_toc(self._toc_entries, str(rel_path), root_path)
# Build breadcrumb HTML
breadcrumb_html = self._build_breadcrumb(rel_path, root_path)
# Calculate search.json path relative to this page
search_json_path = root_path + "search.json"
# Render full HTML
html = STATIC_HTML_TEMPLATE.format(
title=page.title,
toc_html=toc_html,
breadcrumb_html=breadcrumb_html,
content_html=html_content,
search_json_path=search_json_path,
root_path=root_path,
)
# Write output file
output_file = self.output_path / rel_path.with_suffix(".html")
output_file.parent.mkdir(parents=True, exist_ok=True)
output_file.write_text(html)
@staticmethod
def _render_toc(
entries: list[dict[str, Any]], current_path: str, root_path: str
) -> str:
"""Render TOC entries as HTML. Delegates to shared.render_toc."""
return render_toc(entries, current_path, root_path)
@staticmethod
def _render_toc_entry(
entry: dict[str, Any], current_path: str, root_path: str
) -> str:
"""Render a single TOC entry recursively. Delegates to shared.render_toc_entry."""
return render_toc_entry(entry, current_path, root_path)
def _build_breadcrumb(self, rel_path: Path, root_path: str) -> str:
"""Build breadcrumb navigation HTML. Delegates to shared.build_breadcrumb."""
return build_breadcrumb(rel_path, root_path, self.wiki_path)
class HtmlExporter:
"""Export wiki markdown to static HTML files.
This is the synchronous wrapper class that maintains backwards compatibility.
For large wikis, use StreamingHtmlExporter directly for async streaming export.
"""
def __init__(
self,
wiki_path: Path,
output_path: Path,
*,
no_progress: bool = False,
):
"""Initialize the exporter.
Args:
wiki_path: Path to the .deepwiki directory
output_path: Output directory for HTML files
no_progress: If True, disable progress bars
"""
self.wiki_path = Path(wiki_path)
self.output_path = Path(output_path)
self.toc_entries: list[dict] = []
self._no_progress = no_progress
def export(self) -> int:
"""Export all wiki pages to HTML.
Returns:
Number of pages exported
"""
logger.info(
"Starting HTML export from %s to %s", self.wiki_path, self.output_path
)
# Check if we should use streaming mode
iterator = WikiPageIterator(self.wiki_path)
use_streaming = iterator.should_use_streaming()
if use_streaming:
logger.info("Large wiki detected, using streaming export mode")
return self._export_streaming()
return self._export_standard()
def _export_streaming(self) -> int:
"""Export using streaming mode for large wikis."""
streaming_exporter = StreamingHtmlExporter(
self.wiki_path,
self.output_path,
no_progress=self._no_progress,
)
# Run async export in event loop
with create_progress(disable=self._no_progress) as progress:
task_id = progress.add_task("Exporting HTML (streaming)", total=None)
def progress_callback(current: int, total: int, message: str) -> None:
progress.update(
task_id, total=total, completed=current, description=message
)
loop = asyncio.new_event_loop()
try:
result = loop.run_until_complete(
streaming_exporter.export(progress_callback=progress_callback)
)
finally:
loop.close()
return result.pages_exported
def _export_standard(self) -> int:
"""Export using standard mode (loads all pages in memory)."""
# Load TOC
toc_path = self.wiki_path / "toc.json"
if toc_path.exists():
toc_data = json.loads(toc_path.read_text())
self.toc_entries = toc_data.get("entries", [])
logger.debug("Loaded %s TOC entries", len(self.toc_entries))
# Create output directory
self.output_path.mkdir(parents=True, exist_ok=True)
# Copy search.json
search_src = self.wiki_path / "search.json"
if search_src.exists():
shutil.copy(search_src, self.output_path / "search.json")
logger.debug("Copied search.json to output directory")
# Find all markdown files
md_files = list(self.wiki_path.rglob("*.md"))
# Export with progress bar
exported = 0
with create_progress(disable=self._no_progress) as progress:
task = progress.add_task("Exporting HTML", total=len(md_files))
for md_file in md_files:
rel_path = md_file.relative_to(self.wiki_path)
progress.update(task, description=f"Exporting {rel_path.name}")
self._export_page(md_file, rel_path)
exported += 1
progress.update(task, advance=1)
logger.info("Exported %s pages to HTML", exported)
return exported
def _export_page(self, md_file: Path, rel_path: Path) -> None:
"""Export a single markdown page to HTML.
Args:
md_file: Path to the markdown file
rel_path: Relative path from wiki root
"""
logger.debug("Exporting page: %s", rel_path)
# Read and convert markdown, fix internal links, set external link targets
content = md_file.read_text()
html_content = render_markdown(content)
html_content = fix_internal_links(html_content)
html_content = add_external_link_targets(html_content)
title = extract_title(md_file)
# Calculate depth for relative paths
depth = len(rel_path.parts) - 1
root_path = "../" * depth if depth > 0 else "./"
# Build TOC HTML with correct relative paths
toc_html = self._render_toc(self.toc_entries, str(rel_path), root_path)
# Build breadcrumb HTML
breadcrumb_html = self._build_breadcrumb(rel_path, root_path)
# Calculate search.json path relative to this page
search_json_path = root_path + "search.json"
# Render full HTML
html = STATIC_HTML_TEMPLATE.format(
title=title,
toc_html=toc_html,
breadcrumb_html=breadcrumb_html,
content_html=html_content,
search_json_path=search_json_path,
root_path=root_path,
)
# Write output file
output_file = self.output_path / rel_path.with_suffix(".html")
output_file.parent.mkdir(parents=True, exist_ok=True)
output_file.write_text(html)
@staticmethod
def _render_toc(entries: list[dict], current_path: str, root_path: str) -> str:
"""Render TOC entries as HTML. Delegates to shared.render_toc."""
return render_toc(entries, current_path, root_path)
@staticmethod
def _render_toc_entry(entry: dict, current_path: str, root_path: str) -> str:
"""Render a single TOC entry recursively. Delegates to shared.render_toc_entry."""
return render_toc_entry(entry, current_path, root_path)
def _build_breadcrumb(self, rel_path: Path, root_path: str) -> str:
"""Build breadcrumb navigation HTML. Delegates to shared.build_breadcrumb."""
return build_breadcrumb(rel_path, root_path, self.wiki_path)
def export_to_html(
wiki_path: str | Path,
output_path: str | Path | None = None,
*,
no_progress: bool = False,
) -> str:
"""Export wiki to static HTML files.
Args:
wiki_path: Path to the .deepwiki directory
output_path: Output directory (default: {wiki_path}_html)
no_progress: If True, disable progress bars
Returns:
Path to the output directory
"""
wiki_path = Path(wiki_path)
if output_path is None:
output_path = wiki_path.parent / f"{wiki_path.name}_html"
else:
output_path = Path(output_path)
logger.info("Exporting wiki from %s to %s", wiki_path, output_path)
exporter = HtmlExporter(wiki_path, output_path, no_progress=no_progress)
count = exporter.export()
logger.info("HTML export complete: %s pages", count)
return f"Exported {count} pages to {output_path}"
def main() -> int:
"""CLI entry point for HTML export."""
parser = argparse.ArgumentParser(
description="Export DeepWiki documentation to static HTML"
)
parser.add_argument(
"wiki_path",
nargs="?",
default=".deepwiki",
help="Path to the .deepwiki directory (default: .deepwiki)",
)
parser.add_argument(
"--output", "-o", help="Output directory (default: {wiki_path}_html)"
)
parser.add_argument(
"--no-progress",
action="store_true",
help="Disable progress bars (for non-interactive use)",
)
args = parser.parse_args()
wiki_path = Path(args.wiki_path).resolve()
if not wiki_path.exists():
print(f"Error: Wiki path does not exist: {wiki_path}")
return 1
output_path = Path(args.output).resolve() if args.output else None
result = export_to_html(wiki_path, output_path, no_progress=args.no_progress)
print(result)
# Print location hint
actual_output = output_path or (wiki_path.parent / f"{wiki_path.name}_html")
print(f"\nOpen {actual_output}/index.html in a browser to view the documentation.")
return 0
if __name__ == "__main__":
exit(main())