"""PDF export functionality for DeepWiki documentation."""
from __future__ import annotations
import asyncio
import base64
import shutil
import tempfile
import time
from pathlib import Path
from typing import Any
import markdown
try:
from weasyprint import CSS, HTML
except ImportError:
CSS = None # fallback if weasyprint not installed
HTML = None
from local_deepwiki.export.mermaid_renderer import (
extract_mermaid_blocks,
is_mmdc_available,
render_mermaid_to_png,
render_mermaid_to_svg,
)
from local_deepwiki.export.pdf_styles import PDF_HTML_TEMPLATE, PRINT_CSS
from local_deepwiki.export.shared import extract_title as _shared_extract_title
from local_deepwiki.export.streaming import (
ExportConfig,
ExportResult,
ProgressCallback,
StreamingExporter,
WikiPage,
)
from local_deepwiki.logging import get_logger
logger = get_logger(__name__)
def _require_weasyprint() -> None:
    """Ensure WeasyPrint is importable; raise ImportError with install help otherwise."""
    if HTML is not None:
        return
    message = (
        "WeasyPrint is required for PDF export but is not installed.\n"
        "Install with: uv pip install weasyprint\n"
        "See: https://doc.courtbouillon.org/weasyprint/stable/first_steps.html"
    )
    raise ImportError(message)
def render_markdown_for_pdf(content: str, render_mermaid: bool = True) -> str:
    """Convert markdown text to HTML suitable for PDF generation.

    Args:
        content: Markdown content.
        render_mermaid: If True, attempt to render mermaid diagrams using CLI.
            Falls back to placeholder if CLI is not available.

    Returns:
        HTML string.
    """
    if render_mermaid and is_mmdc_available():
        # Render each mermaid block to PNG (better font support than SVG)
        # and embed it inline as a base64 data URI.
        processed = content
        for full_block, diagram_code in extract_mermaid_blocks(content):
            png_bytes = render_mermaid_to_png(diagram_code)
            if png_bytes:
                encoded = base64.b64encode(png_bytes).decode("ascii")
                replacement = (
                    '<div class="mermaid-diagram">'
                    f'<img src="data:image/png;base64,{encoded}" alt="Mermaid diagram">'
                    "</div>"
                )
            else:
                # Rendering failed for this diagram; leave a visible note.
                replacement = (
                    '<div class="mermaid-note">'
                    "[Diagram rendering failed - view in HTML version]"
                    "</div>"
                )
            processed = processed.replace(full_block, replacement)
    else:
        # mermaid CLI unavailable: drop each diagram body and insert a note
        # where the ```mermaid fence opened.
        placeholder = (
            '<div class="mermaid-note">'
            "[Diagram not available in PDF - view in HTML version]"
            "</div>"
        )
        kept: list[str] = []
        inside_diagram = False
        for line in content.split("\n"):
            stripped = line.strip()
            if stripped == "```mermaid":
                inside_diagram = True
                kept.append(placeholder)
            elif inside_diagram and stripped == "```":
                inside_diagram = False
            elif not inside_diagram:
                kept.append(line)
        processed = "\n".join(kept)

    converter = markdown.Markdown(extensions=["fenced_code", "tables", "toc"])
    return converter.convert(processed)
def extract_title(md_file: Path) -> str:
    """Extract the title of a markdown file.

    Thin delegate to ``shared.extract_title``, kept here for backward
    compatibility with callers importing from this module.

    Args:
        md_file: Path to markdown file.

    Returns:
        Extracted title or filename-based title.
    """
    return _shared_extract_title(md_file)
class StreamingPdfExporter(StreamingExporter):
    """Memory-efficient PDF exporter using streaming page iteration.

    Processes pages in batches, writes intermediate PDFs to temp files,
    then merges them at the end. Suitable for large wikis to avoid OOM.
    """

    def __init__(
        self,
        wiki_path: Path,
        output_path: Path,
        config: ExportConfig | None = None,
        *,
        no_progress: bool = False,
    ):
        """Initialize the streaming PDF exporter.

        Args:
            wiki_path: Path to the .deepwiki directory.
            output_path: Output path for PDF file(s).
            config: Export configuration.
            no_progress: If True, disable progress bars.
        """
        super().__init__(wiki_path, output_path, config)
        # NOTE(review): not read within this class — presumably consumed by
        # CLI/progress tooling elsewhere; confirm before removing.
        self._no_progress = no_progress

    async def export(
        self, progress_callback: ProgressCallback | None = None
    ) -> ExportResult:
        """Export wiki to PDF with streaming/batched processing.

        Pages are accumulated into batches of ``config.batch_size``, each
        batch is rendered to an intermediate PDF in a temp directory, and
        the intermediates are merged into the final output file.

        Args:
            progress_callback: Optional callback for progress updates.

        Returns:
            ExportResult with export statistics.
        """
        start_time = time.monotonic()
        errors: list[str] = []
        logger.info(
            "Starting streaming PDF export from %s to %s",
            self.wiki_path,
            self.output_path,
        )

        # Load TOC for ordering (blocking I/O, so run off the event loop).
        await asyncio.to_thread(self.load_toc)

        # Get page count for progress
        iterator = self.get_page_iterator()
        total_pages = iterator.get_page_count()

        # Report total pages at start
        if progress_callback:
            progress_callback(
                0, total_pages, f"Starting PDF export ({total_pages} pages)"
            )

        # Determine output file: a directory target gets a default filename.
        output_file = self.output_path
        if output_file.is_dir():
            output_file = output_file / "documentation.pdf"
        await asyncio.to_thread(output_file.parent.mkdir, parents=True, exist_ok=True)

        # Process pages in batches and create intermediate PDFs
        batch_size = self.config.batch_size
        batch_num = 0
        pages_processed = 0
        temp_pdfs: list[Path] = []

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)
            batch_pages: list[WikiPage] = []

            async for page in iterator:
                try:
                    batch_pages.append(page)
                    pages_processed += 1
                    if progress_callback:
                        progress_callback(
                            pages_processed,
                            total_pages,
                            f"Processing page {pages_processed} of {total_pages}: {page.path}",
                        )
                    # When batch is full, render to intermediate PDF
                    if len(batch_pages) >= batch_size:
                        batch_pdf = temp_path / f"batch_{batch_num:04d}.pdf"
                        try:
                            await asyncio.to_thread(
                                self._render_batch_to_pdf,
                                batch_pages,
                                batch_pdf,
                                batch_num == 0,
                            )
                            temp_pdfs.append(batch_pdf)
                        finally:
                            # Release memory and reset batch state even when
                            # rendering fails; otherwise a failing batch keeps
                            # growing, retries under the same temp filename,
                            # and never releases page content.
                            for p in batch_pages:
                                p.release_content()
                            batch_pages = []
                            batch_num += 1
                except Exception as e:  # noqa: BLE001 — export error boundary: one page failure must not abort entire PDF export
                    error_msg = f"Failed to process {page.path}: {e}"
                    logger.warning(error_msg)
                    errors.append(error_msg)

            # Process remaining pages (partial final batch).
            if batch_pages:
                batch_pdf = temp_path / f"batch_{batch_num:04d}.pdf"
                try:
                    await asyncio.to_thread(
                        self._render_batch_to_pdf, batch_pages, batch_pdf, batch_num == 0
                    )
                    temp_pdfs.append(batch_pdf)
                finally:
                    for p in batch_pages:
                        p.release_content()

            # Merge all batch PDFs into final output
            if progress_callback:
                progress_callback(
                    pages_processed, total_pages, "Merging PDF batches..."
                )

            if len(temp_pdfs) == 1:
                # Only one batch, just copy it
                await asyncio.to_thread(shutil.copy, temp_pdfs[0], output_file)
            elif len(temp_pdfs) > 1:
                # Merging is blocking (pypdf/WeasyPrint); keep it off the loop
                # like every other blocking call in this method.
                await asyncio.to_thread(self._merge_pdfs, temp_pdfs, output_file)
            else:
                # No pages - create empty PDF
                await asyncio.to_thread(self._create_empty_pdf, output_file)

        # Report completion
        if progress_callback:
            progress_callback(
                pages_processed,
                total_pages,
                f"PDF export complete ({pages_processed} pages)",
            )

        duration_ms = int((time.monotonic() - start_time) * 1000)
        logger.info(
            "Streaming PDF export complete: %d pages in %d batches, %dms",
            pages_processed,
            len(temp_pdfs),
            duration_ms,
        )
        return ExportResult(
            pages_exported=pages_processed,
            output_path=output_file,
            duration_ms=duration_ms,
            errors=errors,
        )

    async def export_separate(
        self, progress_callback: ProgressCallback | None = None
    ) -> ExportResult:
        """Export each wiki page as a separate PDF with streaming.

        Args:
            progress_callback: Optional callback for progress updates.

        Returns:
            ExportResult with export statistics.
        """
        start_time = time.monotonic()
        errors: list[str] = []
        logger.info("Starting streaming separate PDF export from %s", self.wiki_path)

        # Determine output directory: a ".pdf" target becomes a sibling
        # directory named after its stem.
        output_dir = self.output_path
        if output_dir.suffix == ".pdf":
            output_dir = output_dir.parent / output_dir.stem
        await asyncio.to_thread(output_dir.mkdir, parents=True, exist_ok=True)

        # Get page count for progress
        iterator = self.get_page_iterator()
        total_pages = iterator.get_page_count()

        # Report total pages at start
        if progress_callback:
            progress_callback(
                0, total_pages, f"Starting separate PDF export ({total_pages} pages)"
            )

        exported = 0
        async for page in iterator:
            try:
                rel_path = page.metadata.relative_path
                output_file = output_dir / rel_path.with_suffix(".pdf")
                await asyncio.to_thread(
                    output_file.parent.mkdir, parents=True, exist_ok=True
                )
                await asyncio.to_thread(self._export_single_page, page, output_file)
                exported += 1
                if progress_callback:
                    progress_callback(
                        exported,
                        total_pages,
                        f"Exported page {exported} of {total_pages}: {page.path}",
                    )
            except Exception as e:  # noqa: BLE001 — export error boundary: one page failure must not abort entire PDF export
                error_msg = f"Failed to export {page.path}: {e}"
                logger.warning(error_msg)
                errors.append(error_msg)
            finally:
                # Release content even on failure so errored pages do not
                # accumulate in memory over a large export.
                page.release_content()

        # Report completion
        if progress_callback:
            progress_callback(
                exported,
                total_pages,
                f"Separate PDF export complete ({exported} pages)",
            )

        duration_ms = int((time.monotonic() - start_time) * 1000)
        logger.info(
            "Streaming separate PDF export complete: %d pages in %dms",
            exported,
            duration_ms,
        )
        return ExportResult(
            pages_exported=exported,
            output_path=output_dir,
            duration_ms=duration_ms,
            errors=errors,
        )

    def _render_batch_to_pdf(
        self, pages: list[WikiPage], output_path: Path, include_toc: bool = False
    ) -> None:
        """Render a batch of pages to a PDF file.

        Args:
            pages: List of WikiPage objects to render.
            output_path: Path for the output PDF.
            include_toc: If True, include TOC at the start (first batch only).

        Raises:
            ImportError: If WeasyPrint is not installed.
        """
        # Fail fast before doing any rendering work.
        _require_weasyprint()

        parts = []
        if include_toc:
            # Add title page with TOC for first batch
            parts.append("<h1>Documentation</h1>")
            parts.append("<h2>Table of Contents</h2>")
            parts.append(self._build_streaming_toc_html())
            parts.append('<div class="page-break"></div>')

        for i, page in enumerate(pages):
            parts.append(render_markdown_for_pdf(page.content))
            # Add page break between pages (except last)
            if i < len(pages) - 1:
                parts.append('<div class="page-break"></div>')

        full_html = PDF_HTML_TEMPLATE.format(
            title="Documentation",
            content="\n".join(parts),
        )
        html_doc = HTML(string=full_html)
        css = CSS(string=PRINT_CSS)
        html_doc.write_pdf(output_path, stylesheets=[css])

    def _build_streaming_toc_html(self) -> str:
        """Build TOC HTML from loaded TOC entries."""
        parts = ['<div class="toc">']
        self._add_toc_entries_html(self._toc_entries, parts, 0)
        parts.append("</div>")
        return "\n".join(parts)

    def _add_toc_entries_html(
        self, entries: list[dict[str, Any]], parts: list[str], depth: int
    ) -> None:
        """Recursively add TOC entries to HTML parts.

        Args:
            entries: TOC entry dicts with "title" and optional "children".
            parts: Output list to which HTML fragments are appended.
            depth: Current nesting depth (0 for top level).
        """
        for entry in entries:
            title = entry.get("title", "")
            # Use non-breaking spaces: literal spaces would be collapsed by
            # HTML whitespace handling and the nesting would not render.
            indent = "&nbsp;&nbsp;" * depth
            parts.append(f'<div class="toc-item">{indent}{title}</div>')
            if "children" in entry:
                self._add_toc_entries_html(entry["children"], parts, depth + 1)

    @staticmethod
    def _export_single_page(page: WikiPage, output_file: Path) -> None:
        """Export a single wiki page to PDF.

        Args:
            page: WikiPage object to export.
            output_file: Output PDF path.

        Raises:
            ImportError: If WeasyPrint is not installed.
        """
        _require_weasyprint()
        logger.debug("Exporting page: %s", page.path)
        full_html = PDF_HTML_TEMPLATE.format(
            title=page.title,
            content=render_markdown_for_pdf(page.content),
        )
        html_doc = HTML(string=full_html)
        css = CSS(string=PRINT_CSS)
        html_doc.write_pdf(output_file, stylesheets=[css])

    @staticmethod
    def _merge_pdfs(pdf_files: list[Path], output_path: Path) -> None:
        """Merge multiple PDF files into one.

        Uses pypdf if available; otherwise falls back to copying the first
        batch only (and logs a warning about the dropped batches).

        Args:
            pdf_files: List of PDF file paths to merge.
            output_path: Output path for merged PDF.
        """
        try:
            # Narrow try: only the import is guarded, so an ImportError
            # raised inside the merge itself is not silently swallowed.
            from pypdf import PdfWriter
        except ImportError:
            # Fallback: Copy first PDF only — remaining batches are lost
            logger.warning(
                "pypdf not available for PDF merging. "
                "Only %d of %d PDF batches included in output. "
                "Install pypdf (`pip install pypdf`) for complete multi-batch merging.",
                1,
                len(pdf_files),
            )
            shutil.copy(pdf_files[0], output_path)
            return

        writer = PdfWriter()
        try:
            for pdf_file in pdf_files:
                writer.append(str(pdf_file))
            writer.write(str(output_path))
        finally:
            # Always release writer resources, even if append/write fails.
            writer.close()
        logger.debug("Merged %s PDFs using pypdf", len(pdf_files))

    @staticmethod
    def _create_empty_pdf(output_path: Path) -> None:
        """Create an empty PDF file.

        Args:
            output_path: Path for the output PDF.

        Raises:
            ImportError: If WeasyPrint is not installed.
        """
        _require_weasyprint()
        empty_html = PDF_HTML_TEMPLATE.format(
            title="Documentation",
            content="<p>No pages to export.</p>",
        )
        html_doc = HTML(string=empty_html)
        css = CSS(string=PRINT_CSS)
        html_doc.write_pdf(output_path, stylesheets=[css])
from local_deepwiki.export.pdf_sync import ( # noqa: E402
PdfExporter,
export_to_pdf,
main,
)
# Re-export for backward compatibility
# NOTE: "_require_weasyprint" is private by convention but deliberately kept
# in __all__ so existing external importers are not broken.
__all__ = [
    "_require_weasyprint",
    "render_markdown_for_pdf",
    "extract_title",
    "StreamingPdfExporter",
    "PdfExporter",
    "export_to_pdf",
    "main",
]

# Script entry point: delegates to the CLI main imported from pdf_sync.
if __name__ == "__main__":
    main()