"""File documentation generation for wiki."""
import asyncio
import re
import time
from dataclasses import dataclass
from pathlib import Path
from collections.abc import Awaitable, Callable
from typing import TYPE_CHECKING
from local_deepwiki.config import Config
from local_deepwiki.core.git_utils import (
GitRepoInfo,
build_source_url,
format_blame_date,
get_file_entity_blame,
get_repo_info,
)
from local_deepwiki.core.vectorstore import VectorStore
from local_deepwiki.generators.api_docs import get_file_api_docs
from local_deepwiki.generators.callgraph import get_file_call_graph, get_file_callers
from local_deepwiki.generators.context_builder import build_file_context, format_context_for_llm
from local_deepwiki.generators.crosslinks import EntityRegistry
from local_deepwiki.generators.diagrams import generate_class_diagram
from local_deepwiki.generators.test_examples import get_file_examples
from local_deepwiki.logging import get_logger
from local_deepwiki.models import (
ChunkType,
CodeChunk,
FileInfo,
IndexStatus,
ProgressCallback,
WikiPage,
)
from local_deepwiki.providers.base import LLMProvider
if TYPE_CHECKING:
from local_deepwiki.generators.progress_tracker import GenerationProgress
from local_deepwiki.generators.wiki_status import WikiStatusManager
logger = get_logger(__name__)
def _get_syntax_lang(language: str | None) -> str:
"""Get syntax highlighting language string.
Args:
language: Programming language name.
Returns:
Language string for markdown code blocks.
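Example:
    >>> _get_syntax_lang("python")
    'python'
    >>> _get_syntax_lang(None)
    ''
    >>> _get_syntax_lang("haskell")
    ''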
"""
lang_map = {
"python": "python",
"javascript": "javascript",
"typescript": "typescript",
"tsx": "tsx",
"go": "go",
"rust": "rust",
"java": "java",
"c": "c",
"cpp": "cpp",
"swift": "swift",
"ruby": "ruby",
"php": "php",
"kotlin": "kotlin",
"csharp": "csharp",
}
return lang_map.get(language or "", "")
def _create_source_details(
chunk: CodeChunk, syntax_lang: str, github_url: str | None = None
) -> str:
"""Create a collapsible source code block for a chunk.
Args:
chunk: The code chunk.
syntax_lang: Syntax highlighting language.
github_url: Optional GitHub URL to link to source.
Returns:
Markdown details block with source code.
"""
if github_url:
summary = f'View Source (lines {chunk.start_line}-{chunk.end_line}) | <a href="{github_url}">GitHub</a>'
else:
summary = f"View Source (lines {chunk.start_line}-{chunk.end_line})"
return f"""<details>
<summary>{summary}</summary>
```{syntax_lang}
{chunk.content}
```
</details>
"""
@dataclass
class _ChunkMaps:
"""Maps for looking up chunks by name."""
chunk_map: dict[str, CodeChunk]
class_map: dict[str, CodeChunk]
all_chunk_ids: set[str]
def _build_chunk_maps(chunks: list[CodeChunk]) -> _ChunkMaps:
"""Build lookup maps for chunks by name.
Args:
chunks: List of code chunks.
Returns:
ChunkMaps with name-to-chunk mappings.
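For example, a method ``save`` whose ``parent_name`` is ``User`` is keyed
under both ``"save"`` and ``"User.save"``, and the ``User`` class chunk
appears in both ``chunk_map`` and ``class_map``.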
"""
chunk_map: dict[str, CodeChunk] = {}
class_map: dict[str, CodeChunk] = {}
all_chunk_ids: set[str] = set()
for chunk in chunks:
if chunk.name and chunk.chunk_type in (
ChunkType.CLASS,
ChunkType.FUNCTION,
ChunkType.METHOD,
):
all_chunk_ids.add(chunk.id)
chunk_map[chunk.name] = chunk
if chunk.parent_name:
qualified_name = f"{chunk.parent_name}.{chunk.name}"
chunk_map[qualified_name] = chunk
if chunk.chunk_type == ChunkType.CLASS:
class_map[chunk.name] = chunk
return _ChunkMaps(chunk_map, class_map, all_chunk_ids)
def _extract_entity_from_heading(line: str) -> tuple[str | None, bool]:
"""Extract entity name from a markdown heading.
Args:
line: Heading line like "#### `name`" or "### class `name`".
Returns:
    Tuple of (entity_name, is_class_heading). Note that is_class_heading
    is True only when the backticked text itself begins with "class "
    (e.g. "### `class Foo`"); for headings like "### class `Foo`" the
    caller tracks the class context separately.
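Example:
    >>> _extract_entity_from_heading("#### `parse(data)`")
    ('parse', False)
    >>> _extract_entity_from_heading("### `class Config`")
    ('Config', True)
    >>> _extract_entity_from_heading("### class `Config`")
    ('Config', False)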
"""
start = line.find("`") + 1
end = line.find("`", start)
if start <= 0 or end <= start:
return None, False
entity_name = line[start:end]
# Normalize: strip signature
if "(" in entity_name:
entity_name = entity_name.split("(")[0]
# Check if class heading
is_class = entity_name.startswith("class ")
if is_class:
entity_name = entity_name[6:].strip()
return entity_name, is_class
def _find_matching_chunk(
entity_name: str,
current_class: str | None,
maps: _ChunkMaps,
) -> CodeChunk | None:
"""Find the chunk that matches an entity name.
Args:
entity_name: Name of the entity to find.
current_class: Current class context, if any.
maps: Chunk lookup maps.
Returns:
Matching chunk or None.
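For example, entity ``save`` with ``current_class == "User"`` is resolved
as ``User.save`` first, then as bare ``save`` (accepted only if its parent
matches), and finally falls back to the ``User`` class chunk itself.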
"""
matched_chunk: CodeChunk | None = None
# Try qualified name first for methods
if current_class and entity_name != current_class:
qualified_name = f"{current_class}.{entity_name}"
matched_chunk = maps.chunk_map.get(qualified_name)
# Try simple name
if matched_chunk is None:
candidate = maps.chunk_map.get(entity_name)
if candidate is not None:
if candidate.parent_name is None or candidate.parent_name == current_class:
matched_chunk = candidate
# Fallback to class source for unmatched methods
if matched_chunk is None and current_class and entity_name != current_class:
matched_chunk = maps.class_map.get(current_class)
return matched_chunk
def _find_insertion_point(
lines: list[str],
start_idx: int,
result_lines: list[str],
chunk: CodeChunk,
syntax_lang: str,
chunk_url: str | None,
) -> int:
"""Find where to insert source code and add it.
Args:
lines: All content lines.
start_idx: Starting line index.
result_lines: Result lines to append to.
chunk: Chunk to insert source for.
syntax_lang: Syntax highlighting language.
chunk_url: Optional GitHub URL.
Returns:
New line index to continue from.
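The source block is inserted immediately after a ``**Returns:**`` line
(and any blank lines that follow it), otherwise just before the next
heading, or at the end of the content if no heading follows.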
"""
j = start_idx
while j < len(lines):
    next_line = lines[j]
    # Stop at the next heading of same or higher level and insert the
    # source block just before it
    if next_line.startswith(("#### ", "### ", "## ")):
        result_lines.append("")
        result_lines.append(_create_source_details(chunk, syntax_lang, chunk_url))
        return j - 1
    # Insert right after a "**Returns:**" line and any trailing blanks
    if next_line.startswith("**Returns:**"):
        result_lines.append(lines[j])
        j += 1
        # Skip blank lines after Returns
        while j < len(lines) and lines[j].strip() == "":
            result_lines.append(lines[j])
            j += 1
        # Insert source code here
        result_lines.append("")
        result_lines.append(_create_source_details(chunk, syntax_lang, chunk_url))
        return j - 1
    result_lines.append(lines[j])
    j += 1
# Reached end of content: append the source block at the end
result_lines.append("")
result_lines.append(_create_source_details(chunk, syntax_lang, chunk_url))
return j - 1
def _append_unused_chunks(
result_lines: list[str],
chunks: list[CodeChunk],
all_chunk_ids: set[str],
used_chunks: set[str],
syntax_lang: str,
get_url: Callable[[CodeChunk], str | None],
) -> None:
"""Append unused chunks as additional source code section.
Args:
result_lines: Lines to append to.
chunks: All chunks.
all_chunk_ids: Set of all chunk IDs.
used_chunks: Set of already-used chunk IDs.
syntax_lang: Syntax highlighting language.
get_url: Function to get GitHub URL for a chunk.
"""
unused = [c for c in chunks if c.id in all_chunk_ids and c.id not in used_chunks]
if not unused:
return
result_lines.append("")
result_lines.append("## Additional Source Code")
result_lines.append("")
result_lines.append(
"Source code for functions and methods not listed in the API Reference above."
)
result_lines.append("")
for chunk in sorted(unused, key=lambda c: c.start_line):
heading = "###" if chunk.chunk_type == ChunkType.CLASS else "####"
result_lines.append(f"{heading} `{chunk.name}`")
result_lines.append("")
result_lines.append(_create_source_details(chunk, syntax_lang, get_url(chunk)))
result_lines.append("")
def _inject_inline_source_code(
content: str,
chunks: list[CodeChunk],
language: str | None,
repo_info: GitRepoInfo | None = None,
) -> str:
"""Inject collapsible source code after each function/class in the API Reference.
Args:
content: The markdown content to process.
chunks: List of code chunks from the file.
language: Programming language for syntax highlighting.
repo_info: Optional git repo info for GitHub links.
Returns:
Content with inline source code blocks injected.
"""
maps = _build_chunk_maps(chunks)
if not maps.chunk_map:
return content
syntax_lang = _get_syntax_lang(language)
used_chunks: set[str] = set()
def get_chunk_url(chunk: CodeChunk) -> str | None:
if repo_info is None:
return None
return build_source_url(repo_info, chunk.file_path, chunk.start_line, chunk.end_line)
lines = content.split("\n")
result_lines: list[str] = []
current_class: str | None = None
i = 0
while i < len(lines):
line = lines[i]
result_lines.append(line)
# Track class context
if line.startswith("### class `"):
entity, _ = _extract_entity_from_heading(line)
if entity:
current_class = entity
# Look for API Reference headings
if line.startswith(("#### `", "### `", "### class `")):
entity_name, is_class = _extract_entity_from_heading(line)
if entity_name:
if is_class:
current_class = entity_name
matched_chunk = _find_matching_chunk(entity_name, current_class, maps)
if matched_chunk is not None:
used_chunks.add(matched_chunk.id)
i = _find_insertion_point(
lines, i + 1, result_lines, matched_chunk,
syntax_lang, get_chunk_url(matched_chunk)
)
i += 1
_append_unused_chunks(
result_lines, chunks, maps.all_chunk_ids, used_chunks,
syntax_lang, get_chunk_url
)
return "\n".join(result_lines)
async def _gather_file_context(
file_info: FileInfo,
index_status: IndexStatus,
vector_store: VectorStore,
) -> tuple[list[CodeChunk], str, str] | None:
"""Collect chunks, imports, and related context for the file.
Args:
file_info: File status information.
index_status: Index status with repo information.
vector_store: Vector store with indexed code.
Returns:
Tuple of (chunks_list, context_text, rich_context_text) or None if no content.
"""
# Get all chunks for this file using direct lookup (efficient scalar index)
file_chunks = await vector_store.get_chunks_by_file(file_info.path)
if not file_chunks:
return None # No content to document
# Build context from chunks
context_parts = []
for chunk in file_chunks[:15]: # Limit context size
context_parts.append(
f"Type: {chunk.chunk_type.value}\n"
f"Name: {chunk.name}\n"
f"Lines: {chunk.start_line}-{chunk.end_line}\n"
f"```\n{chunk.content[:600]}\n```"
)
context = "\n\n".join(context_parts)
# Build rich context with imports, callers, and related files
rich_context = await build_file_context(
file_path=file_info.path,
chunks=file_chunks,
repo_path=Path(index_status.repo_path),
vector_store=vector_store,
)
rich_context_text = format_context_for_llm(rich_context)
return file_chunks, context, rich_context_text
def _build_llm_prompt(
file_info: FileInfo,
context: str,
rich_context_text: str,
) -> str:
"""Construct the LLM prompt with all context.
Args:
file_info: File status information.
context: Code context text.
rich_context_text: Rich context with imports and callers.
Returns:
The formatted LLM prompt string.
"""
return f"""Generate documentation for the file '{file_info.path}' based on the code and context provided.
Language: {file_info.language}
Total code chunks: {file_info.chunk_count}
{rich_context_text}
## Code Contents
{context}
Generate documentation that includes:
1. **File Overview**: Purpose of this file based on the code shown and its dependencies
2. **Classes**: Document each class visible in the code with its purpose and key methods
3. **Functions**: Document each function with parameters and return values as shown
4. **Integration**: How this file fits into the larger codebase (based on imports and callers)
5. **Usage Examples**: Show how to use the components (based on their actual signatures)
CRITICAL CONSTRAINTS:
- ONLY document classes, methods, and functions that appear in the code above
- Do NOT invent additional methods or parameters not shown
- Do NOT fabricate usage examples with APIs not visible in the code
- Write class names as plain text (e.g., "The WikiGenerator class") for cross-linking
- Use the dependency and caller information to explain integration, but don't fabricate details
- Only use backticks for actual code snippets
Format as markdown with clear sections.
Do NOT include mermaid class diagrams - they will be auto-generated."""
async def _generate_and_format_doc(
prompt: str,
llm: LLMProvider,
system_prompt: str,
) -> str:
"""Call LLM and format the response.
Args:
prompt: The LLM prompt.
llm: LLM provider for generation.
system_prompt: System prompt for LLM.
Returns:
The formatted documentation content.
"""
content = await llm.generate(prompt, system_prompt=system_prompt)
# Strip any LLM-generated class diagram sections (we add our own)
content = re.sub(
r"\n*##\s*Class\s*Diagram\s*\n+```mermaid\s*\n+classDiagram.*?```",
"",
content,
flags=re.DOTALL | re.IGNORECASE,
)
return content
def _generate_file_enrichments(
content: str,
abs_file_path: Path,
repo_path: Path,
file_path: str,
all_file_chunks: list[CodeChunk],
) -> str:
"""Generate diagrams, call graphs, examples, and blame info.
Args:
content: The base documentation content.
abs_file_path: Absolute path to the source file.
repo_path: Path to the repository root.
file_path: Relative path to the source file.
all_file_chunks: All code chunks from the file.
Returns:
The enriched documentation content.
"""
# Generate API reference section with type signatures
if abs_file_path.exists():
api_docs = get_file_api_docs(abs_file_path)
if api_docs:
content += "\n\n## API Reference\n\n" + api_docs
# Generate class diagram if file has classes
class_diagram = generate_class_diagram(all_file_chunks)
if class_diagram:
content += "\n\n## Class Diagram\n\n" + class_diagram
# Generate call graph diagram and used-by information
if abs_file_path.exists():
call_graph = get_file_call_graph(abs_file_path, repo_path)
if call_graph:
content += "\n\n## Call Graph\n\n```mermaid\n" + call_graph + "\n```"
# Add "Used by" section showing callers for each function
callers_map = get_file_callers(abs_file_path, repo_path)
if callers_map:
used_by_lines = [
"## Used By",
"",
"Functions and methods in this file and their callers:",
"",
]
for callee in sorted(callers_map.keys()):
callers = callers_map[callee]
if callers:
caller_list = ", ".join(f"`{c}`" for c in sorted(callers))
used_by_lines.append(f"- **`{callee}`**: called by {caller_list}")
if len(used_by_lines) > 4: # More than just the header
content += "\n\n" + "\n".join(used_by_lines)
# Add usage examples from test files
entity_names = [chunk.name for chunk in all_file_chunks if chunk.name and len(chunk.name) > 2]
if entity_names:
examples_md = get_file_examples(
source_file=abs_file_path,
repo_root=repo_path,
entity_names=entity_names,
max_examples=5,
)
if examples_md:
content += "\n\n" + examples_md
# Add git blame "Last Modified" section
blame_section = _generate_blame_section(
repo_path=repo_path,
file_path=file_path,
chunks=all_file_chunks,
)
if blame_section:
content += "\n\n" + blame_section
return content
async def generate_single_file_doc(
file_info: FileInfo,
index_status: IndexStatus,
vector_store: VectorStore,
llm: LLMProvider,
system_prompt: str,
status_manager: "WikiStatusManager",
entity_registry: EntityRegistry,
config: Config,
full_rebuild: bool,
) -> tuple[WikiPage | None, bool]:
"""Generate documentation for a single source file.
Coordinates the documentation generation pipeline:
1. Check if regeneration is needed
2. Gather file context (chunks, imports, related context)
3. Build LLM prompt with all context
4. Generate and format documentation via LLM
5. Add enrichments (diagrams, call graphs, examples, blame)
6. Inject inline source code and register entities
Args:
file_info: File status information.
index_status: Index status with repo information.
vector_store: Vector store with indexed code.
llm: LLM provider for generation.
system_prompt: System prompt for LLM.
status_manager: Wiki status manager for incremental updates.
entity_registry: Entity registry for cross-linking.
config: Configuration.
full_rebuild: If True, regenerate even if unchanged.
Returns:
Tuple of (WikiPage or None, was_skipped).
Returns (None, False) if file should be skipped entirely.
Returns (page, True) if existing page was reused.
Returns (page, False) if new page was generated.
"""
file_path = Path(file_info.path)
repo_path = Path(index_status.repo_path)
# Create nested path structure: files/module/filename.md
parts = file_path.parts
if len(parts) > 1:
wiki_path = f"files/{'/'.join(parts[:-1])}/{file_path.stem}.md"
else:
wiki_path = f"files/{file_path.stem}.md"
source_files = [file_info.path]
# Check if this file page needs regeneration
if not full_rebuild and not status_manager.needs_regeneration(wiki_path, source_files):
existing_page = await status_manager.load_existing_page(wiki_path)
if existing_page is not None:
# Still need to register entities for cross-linking
all_file_chunks = await vector_store.get_chunks_by_file(file_info.path)
entity_registry.register_from_chunks(all_file_chunks, wiki_path)
status_manager.record_page_status(existing_page, source_files)
return existing_page, True # Skipped (reused existing)
# Step 1: Gather file context (chunks, imports, related context)
context_result = await _gather_file_context(
file_info=file_info,
index_status=index_status,
vector_store=vector_store,
)
if context_result is None:
return None, False # No content to document
file_chunks, context, rich_context_text = context_result
# Step 2: Build the LLM prompt
prompt = _build_llm_prompt(
file_info=file_info,
context=context,
rich_context_text=rich_context_text,
)
# Step 3: Generate and format the documentation
content = await _generate_and_format_doc(
prompt=prompt,
llm=llm,
system_prompt=system_prompt,
)
# Step 4: Generate enrichments (diagrams, call graphs, examples, blame)
abs_file_path = repo_path / file_info.path
content = _generate_file_enrichments(
content=content,
abs_file_path=abs_file_path,
repo_path=repo_path,
file_path=file_info.path,
all_file_chunks=file_chunks,
)
# Inject inline source code after each function/class in API Reference
lang_str = file_info.language.value if file_info.language else None
repo_info = get_repo_info(repo_path)
content = _inject_inline_source_code(content, file_chunks, lang_str, repo_info)
# Register entities for cross-linking
entity_registry.register_from_chunks(file_chunks, wiki_path)
page = WikiPage(
path=wiki_path,
title=f"{file_path.name}",
content=content,
generated_at=time.time(),
)
status_manager.record_page_status(page, source_files)
return page, False # Generated new
# Type alias for async write callback
WriteCallback = Callable[[WikiPage], Awaitable[None]]
def _is_test_file(path: str) -> bool:
"""Check if a file is a test file in tests/ directory.
Note: Don't skip test_*.py in src/ (e.g., test_examples.py is a source file).
Args:
path: File path to check.
Returns:
True if file is in tests/ directory.
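Example:
    >>> _is_test_file("tests/test_parser.py")
    True
    >>> _is_test_file("src/local_deepwiki/generators/test_examples.py")
    False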
"""
parts = path.split("/")
return "tests" in parts
def _filter_significant_files(files: list[FileInfo], max_files: int) -> list[FileInfo]:
"""Filter and limit files for documentation generation.
Args:
files: All indexed files.
max_files: Maximum files to document (0 = unlimited).
Returns:
Filtered and prioritized list of files.
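For example, with ``max_files == 50`` and 80 qualifying files, only the
50 files with the highest ``chunk_count`` are kept.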
"""
# Filter: skip __init__.py, test files, and files with minimal content
significant = [
f
for f in files
if not f.path.endswith("__init__.py")
and not _is_test_file(f.path)
and f.chunk_count >= 2
]
# Limit and prioritize by complexity (chunk count)
if max_files > 0 and len(significant) > max_files:
significant = sorted(significant, key=lambda x: x.chunk_count, reverse=True)[:max_files]
return significant
def _create_files_index_page(
pages: list[WikiPage],
significant_files: list[FileInfo],
status_manager: "WikiStatusManager",
) -> WikiPage:
"""Create the files index page.
Args:
pages: All generated file pages.
significant_files: Files that were documented.
status_manager: Status manager for recording.
Returns:
Files index WikiPage.
"""
all_file_paths = [f.path for f in significant_files]
files_index = WikiPage(
path="files/index.md",
title="Source Files",
content=_generate_files_index(pages),
generated_at=time.time(),
)
status_manager.record_page_status(files_index, all_file_paths)
return files_index
async def generate_file_docs(
index_status: IndexStatus,
vector_store: VectorStore,
llm: LLMProvider,
system_prompt: str,
status_manager: "WikiStatusManager",
entity_registry: EntityRegistry,
config: Config,
progress_callback: ProgressCallback | None = None,
full_rebuild: bool = False,
write_callback: WriteCallback | None = None,
generation_progress: "GenerationProgress | None" = None,
) -> tuple[list[WikiPage], int, int]:
"""Generate documentation for individual source files.
Uses parallel LLM calls for faster generation, controlled by
config.wiki.max_concurrent_llm_calls. Pages are written to disk
immediately as they complete if write_callback is provided.
Args:
index_status: Index status with file information.
vector_store: Vector store with indexed code.
llm: LLM provider for generation.
system_prompt: System prompt for LLM.
status_manager: Wiki status manager for incremental updates.
entity_registry: Entity registry for cross-linking.
config: Configuration.
progress_callback: Optional progress callback.
full_rebuild: If True, regenerate all pages.
write_callback: Optional async callback to write pages immediately as they complete.
generation_progress: Optional live progress tracker for status updates.
Returns:
Tuple of (pages list, generated count, skipped count).
"""
significant_files = _filter_significant_files(index_status.files, config.wiki.max_file_docs)
if not significant_files:
return [], 0, 0
# Use semaphore to limit concurrent LLM calls
max_concurrent = config.wiki.max_concurrent_llm_calls
semaphore = asyncio.Semaphore(max_concurrent)
logger.info(
f"Generating file docs for {len(significant_files)} files "
f"(max {max_concurrent} concurrent)"
)
if generation_progress:
generation_progress.start_phase("file_docs", total=len(significant_files))
async def generate_with_semaphore(
file_info: FileInfo,
) -> tuple[FileInfo, WikiPage | None, bool]:
"""Generate doc for a file, returning file_info for tracking."""
async with semaphore:
logger.debug(f"Generating doc for {file_info.path}")
page, was_skipped = await generate_single_file_doc(
file_info=file_info,
index_status=index_status,
vector_store=vector_store,
llm=llm,
system_prompt=system_prompt,
status_manager=status_manager,
entity_registry=entity_registry,
config=config,
full_rebuild=full_rebuild,
)
return file_info, page, was_skipped
# Create and process tasks
tasks = [asyncio.create_task(generate_with_semaphore(f)) for f in significant_files]
pages: list[WikiPage] = []
pages_generated = 0
pages_skipped = 0
for coro in asyncio.as_completed(tasks):
try:
file_info, page, was_skipped = await coro
if page is not None:
pages.append(page)
if was_skipped:
pages_skipped += 1
else:
pages_generated += 1
if write_callback:
await write_callback(page)
if progress_callback:
    progress_callback(f"Processed {file_info.path}", len(pages), len(tasks))
if generation_progress:
generation_progress.complete_file(file_info.path)
except Exception as e:
logger.error(f"Error generating file doc: {e}")
if generation_progress:
generation_progress.complete_file()
# Create files index
if pages:
files_index = _create_files_index_page(pages, significant_files, status_manager)
pages.insert(0, files_index)
if generation_progress:
generation_progress.complete_phase()
logger.info(f"File docs complete: {pages_generated} generated, {pages_skipped} skipped")
return pages, pages_generated, pages_skipped
def _generate_blame_section(
repo_path: Path,
file_path: str,
chunks: list[CodeChunk],
) -> str | None:
"""Generate a "Last Modified" section with git blame info.
Args:
repo_path: Path to the repository root.
file_path: Relative path to the source file.
chunks: Code chunks from the file.
Returns:
Markdown section or None if no blame info available.
"""
# Build entity list for blame lookup
entities: list[tuple[str, str, int, int]] = []
for chunk in chunks:
if chunk.name and chunk.chunk_type in (
ChunkType.CLASS,
ChunkType.FUNCTION,
ChunkType.METHOD,
):
entities.append(
(
chunk.name,
chunk.chunk_type.value,
chunk.start_line,
chunk.end_line,
)
)
if not entities:
return None
# Get blame info for all entities
blame_infos = get_file_entity_blame(repo_path, file_path, entities)
if not blame_infos:
return None
# Sort by most recently modified first
blame_infos.sort(key=lambda b: b.last_modified_date, reverse=True)
# Build the section
lines = [
"## Last Modified",
"",
"| Entity | Type | Author | Date | Commit |",
"|--------|------|--------|------|--------|",
]
for blame in blame_infos:
entity_name = blame.entity_name
entity_type = blame.entity_type
author = blame.last_modified_by
date_str = format_blame_date(blame.last_modified_date)
commit_short = blame.commit_hash[:7]
# Truncate long author names
if len(author) > 20:
author = author[:17] + "..."
# Add commit summary if available (truncated)
commit_info = f"`{commit_short}`"
if blame.commit_summary:
summary = blame.commit_summary
if len(summary) > 30:
summary = summary[:27] + "..."
commit_info = f"`{commit_short}` {summary}"
lines.append(f"| `{entity_name}` | {entity_type} | {author} | {date_str} | {commit_info} |")
return "\n".join(lines)
def _generate_files_index(file_pages: list[WikiPage]) -> str:
"""Generate index page for file documentation.
Args:
file_pages: List of file wiki pages.
Returns:
Markdown content for files index.
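For example, a page at ``files/generators/foo.md`` is listed under a
"generators" section, while a top-level ``files/foo.md`` is listed
under "root".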
"""
lines = [
"# Source Files\n",
"Detailed documentation for individual source files.\n",
]
# Group by directory
by_dir: dict[str, list[WikiPage]] = {}
for page in file_pages:
if page.path == "files/index.md":
continue
parts = Path(page.path).parts
if len(parts) > 2:
dir_name = parts[1] # files/DIR/file.md -> DIR
else:
dir_name = "root"
by_dir.setdefault(dir_name, []).append(page)
for dir_name, dir_pages in sorted(by_dir.items()):
lines.append(f"\n## {dir_name}\n")
for page in sorted(dir_pages, key=lambda p: p.title):
# Make relative link from files/index.md (strip only the leading "files/" prefix)
rel_path = page.path.removeprefix("files/")
lines.append(f"- [{page.title}]({rel_path})")
return "\n".join(lines)