"""Integration tests for the full index -> wiki -> export pipeline.
This test module validates the complete end-to-end flow of:
1. Indexing a repository to extract code chunks and create embeddings
2. Generating wiki documentation from the indexed content
3. Exporting the wiki to HTML format
All tests use temporary directories and mock LLM providers to avoid
external dependencies.
"""
import json
import time
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from local_deepwiki.config import ChunkingConfig, Config, ParsingConfig, WikiConfig
from local_deepwiki.core.indexer import RepositoryIndexer
from local_deepwiki.export.html import HtmlExporter, export_to_html
from local_deepwiki.generators.wiki import WikiGenerator, generate_wiki
from local_deepwiki.models import (
ChunkType,
CodeChunk,
IndexStatus,
Language,
WikiPage,
WikiStructure,
)
# =============================================================================
# Fixtures
# =============================================================================
@pytest.fixture
def sample_python_repo(tmp_path: Path) -> Path:
    """Create a sample Python repository with multiple files for testing.

    Creates a realistic repository structure with:
    - A main module with classes and functions
    - A utils module with helper functions
    - A simple test file

    The file contents are deliberately small but exercise the chunk types the
    indexer should recognize: classes, methods, properties, free functions,
    and module docstrings.

    Args:
        tmp_path: Pytest fixture for temporary directory.

    Returns:
        Path to the created repository.
    """
    repo_path = tmp_path / "sample_repo"
    repo_path.mkdir()

    # Create src directory structure
    src_dir = repo_path / "src"
    src_dir.mkdir()

    # Main module with classes
    (src_dir / "main.py").write_text('''"""Main application module."""


class Application:
    """Main application class.

    Handles initialization and lifecycle of the application.
    """

    def __init__(self, config: dict):
        """Initialize the application.

        Args:
            config: Configuration dictionary.
        """
        self.config = config
        self._running = False

    def start(self) -> None:
        """Start the application."""
        self._running = True
        print("Application started")

    def stop(self) -> None:
        """Stop the application."""
        self._running = False
        print("Application stopped")

    @property
    def is_running(self) -> bool:
        """Check if application is running."""
        return self._running


def create_app(config: dict) -> Application:
    """Factory function to create an Application instance.

    Args:
        config: Application configuration.

    Returns:
        Configured Application instance.
    """
    return Application(config)
''')

    # Utils module with helper functions
    (src_dir / "utils.py").write_text('''"""Utility functions for the application."""

from typing import Any


def validate_config(config: dict) -> bool:
    """Validate the configuration dictionary.

    Args:
        config: Configuration to validate.

    Returns:
        True if valid, False otherwise.
    """
    required_keys = ["name", "version"]
    return all(key in config for key in required_keys)


def format_output(data: Any) -> str:
    """Format data for output.

    Args:
        data: Data to format.

    Returns:
        Formatted string representation.
    """
    if isinstance(data, dict):
        return "\\n".join(f"{k}: {v}" for k, v in data.items())
    return str(data)


class ConfigLoader:
    """Loads configuration from various sources."""

    @staticmethod
    def from_dict(data: dict) -> dict:
        """Load config from a dictionary.

        Args:
            data: Raw configuration data.

        Returns:
            Processed configuration.
        """
        return {"name": data.get("name", "default"), "version": data.get("version", "1.0")}
''')

    # Test file
    tests_dir = repo_path / "tests"
    tests_dir.mkdir()
    (tests_dir / "test_main.py").write_text('''"""Tests for main module."""

import pytest
from src.main import Application, create_app


def test_application_init():
    """Test Application initialization."""
    app = Application({"name": "test"})
    assert app.config == {"name": "test"}
    assert not app.is_running


def test_create_app():
    """Test create_app factory function."""
    app = create_app({"name": "test"})
    assert isinstance(app, Application)
''')
    return repo_path
@pytest.fixture
def mock_embedding_provider():
    """Create a mock embedding provider that returns consistent vectors.

    Every text embeds to the same 384-dimensional constant vector, so
    similarity math is deterministic across test runs.
    """
    dim = 384
    provider = MagicMock()

    def _embed_batch(texts):
        # One constant vector per input text.
        return [[0.1] * dim for _ in texts]

    provider.embed_texts = AsyncMock(side_effect=_embed_batch)
    provider.embed_text = AsyncMock(return_value=[0.1] * dim)
    return provider
def create_mock_vector_store():
    """Create a fully mocked vector store with all async methods.

    All write/delete methods resolve to 0, all query methods resolve to an
    empty list, and the private table accessor returns None.
    """
    store = MagicMock()
    # Async mutators: report zero affected chunks.
    for method in (
        "create_or_update_table",
        "add_chunks",
        "delete_chunks_by_files",
        "delete_chunks_by_file",
    ):
        setattr(store, method, AsyncMock(return_value=0))
    # Async queries: report no results.
    for method in ("search", "get_chunks_by_file"):
        setattr(store, method, AsyncMock(return_value=[]))
    store._get_table = MagicMock(return_value=None)
    return store
@pytest.fixture
def mock_llm_provider():
    """Create a mock LLM provider that returns canned responses.

    The response is chosen by the first keyword group found in the prompt
    (case-insensitive); unmatched prompts get a generic file-doc response.
    """
    # Ordered (keywords, response) table; first match wins.
    canned_responses = [
        (
            ("overview", "index"),
            "# Sample Repo\n\nThis is a sample Python application.\n\n## Features\n\n- Application lifecycle management\n- Configuration validation\n- Utility functions",
        ),
        (
            ("architecture",),
            "# Architecture\n\nThe application follows a modular architecture.\n\n## Components\n\n- Main application class\n- Utility modules\n- Test suite",
        ),
        (
            ("dependencies",),
            "# Dependencies\n\nThis project has minimal dependencies.\n\n## Runtime Dependencies\n\n- Python 3.11+",
        ),
        (
            ("module",),
            "# Module Documentation\n\nThis module provides core functionality.\n\n## Functions\n\n- Core functions for the module",
        ),
    ]

    async def mock_generate(prompt: str, **kwargs) -> str:
        """Generate mock responses based on prompt content."""
        lowered = prompt.lower()
        for keywords, response in canned_responses:
            if any(keyword in lowered for keyword in keywords):
                return response
        # Default response for file documentation
        return "# File Documentation\n\nThis file contains implementation details.\n\n## Contents\n\n- Classes and functions"

    provider = MagicMock()
    provider.generate = AsyncMock(side_effect=mock_generate)
    return provider
@pytest.fixture
def test_config() -> Config:
    """Create a test configuration optimized for integration tests.

    Shrinks batch/chunk sizes and LLM concurrency, and restricts parsing to
    Python, so the pipeline runs quickly on the small sample repo.
    """
    overrides = {
        "chunking": ChunkingConfig().model_copy(
            update={"batch_size": 10, "max_chunk_size": 2000}
        ),
        "parsing": ParsingConfig().model_copy(update={"languages": ["python"]}),
        "wiki": WikiConfig().model_copy(update={"max_concurrent_llm": 2}),
    }
    return Config().model_copy(update=overrides)
# =============================================================================
# Index Tests
# =============================================================================
class TestIndexingPipeline:
    """Tests for the repository indexing phase."""

    async def test_index_creates_chunks(
        self, sample_python_repo: Path, test_config: Config, mock_embedding_provider
    ):
        """Test that indexing extracts code chunks from source files."""
        with patch("local_deepwiki.core.indexer.VectorStore") as MockVectorStore:
            mock_store = MagicMock()
            chunks_stored = []

            async def capture_create(chunks):
                # Record everything the indexer tries to persist.
                chunks_stored.extend(chunks)
                return len(chunks)

            async def capture_add(chunks):
                chunks_stored.extend(chunks)
                return len(chunks)

            mock_store.create_or_update_table = AsyncMock(side_effect=capture_create)
            mock_store.add_chunks = AsyncMock(side_effect=capture_add)
            mock_store.delete_chunks_by_files = AsyncMock(return_value=0)
            MockVectorStore.return_value = mock_store

            indexer = RepositoryIndexer(sample_python_repo, test_config)
            indexer.vector_store = mock_store
            status = await indexer.index(full_rebuild=True)

            # Verify index status
            assert status.total_files > 0
            assert status.total_chunks > 0
            assert "python" in status.languages

            # Verify chunks were extracted
            assert len(chunks_stored) > 0

            # Verify chunk types - should have classes and functions
            chunk_types = {c.chunk_type for c in chunks_stored}
            assert ChunkType.CLASS in chunk_types or ChunkType.FUNCTION in chunk_types

    async def test_index_creates_status_file(
        self, sample_python_repo: Path, test_config: Config
    ):
        """Test that indexing creates an index_status.json file."""
        with patch("local_deepwiki.core.indexer.VectorStore") as MockVectorStore:
            mock_store = MagicMock()
            mock_store.create_or_update_table = AsyncMock(return_value=0)
            mock_store.add_chunks = AsyncMock(return_value=0)
            MockVectorStore.return_value = mock_store

            indexer = RepositoryIndexer(sample_python_repo, test_config)
            indexer.vector_store = mock_store
            await indexer.index(full_rebuild=True)

            # Check status file was created
            status_file = indexer.wiki_path / "index_status.json"
            assert status_file.exists()

            # Verify status file content
            with open(status_file) as f:
                data = json.load(f)
            assert "repo_path" in data
            assert "total_files" in data
            assert "total_chunks" in data
            assert "languages" in data

    async def test_incremental_indexing(
        self, sample_python_repo: Path, test_config: Config
    ):
        """Test that incremental indexing only processes changed files."""
        with patch("local_deepwiki.core.indexer.VectorStore") as MockVectorStore:
            mock_store = MagicMock()
            create_calls = []

            async def track_create(chunks):
                create_calls.append(len(chunks))
                return len(chunks)

            async def track_add(chunks):
                create_calls.append(len(chunks))
                return len(chunks)

            mock_store.create_or_update_table = AsyncMock(side_effect=track_create)
            mock_store.add_chunks = AsyncMock(side_effect=track_add)
            mock_store.delete_chunks_by_files = AsyncMock(return_value=0)
            MockVectorStore.return_value = mock_store

            indexer = RepositoryIndexer(sample_python_repo, test_config)
            indexer.vector_store = mock_store

            # First index
            status1 = await indexer.index(full_rebuild=True)
            first_chunks = sum(create_calls)
            create_calls.clear()

            # Second index without changes (incremental)
            status2 = await indexer.index(full_rebuild=False)
            second_chunks = sum(create_calls)

            # Fix: first_chunks/second_chunks were previously computed but
            # never asserted. An incremental pass over an unchanged repo must
            # not store more chunks than the full rebuild did (it may store
            # fewer or none, depending on file modification detection).
            assert second_chunks <= first_chunks
            assert status2.total_files == status1.total_files
# =============================================================================
# Wiki Generation Tests
# =============================================================================
class TestWikiGenerationPipeline:
    """Tests for the wiki generation phase."""

    @pytest.fixture
    def mock_index_status(self, sample_python_repo: Path) -> IndexStatus:
        """Create a mock index status for wiki generation tests.

        Describes the three files of the sample repo with fixed hashes and
        chunk counts so generation is deterministic.
        """
        # Fix: Language is already imported at module level; re-importing it
        # here was redundant. Only FileInfo needs the local import.
        from local_deepwiki.models import FileInfo

        return IndexStatus(
            repo_path=str(sample_python_repo),
            indexed_at=time.time(),
            total_files=3,
            total_chunks=10,
            languages={"python": 3},
            files=[
                FileInfo(
                    path="src/main.py",
                    language=Language.PYTHON,
                    size_bytes=1000,
                    last_modified=time.time(),
                    hash="abc123",
                    chunk_count=5,
                ),
                FileInfo(
                    path="src/utils.py",
                    language=Language.PYTHON,
                    size_bytes=800,
                    last_modified=time.time(),
                    hash="def456",
                    chunk_count=3,
                ),
                FileInfo(
                    path="tests/test_main.py",
                    language=Language.PYTHON,
                    size_bytes=500,
                    last_modified=time.time(),
                    hash="ghi789",
                    chunk_count=2,
                ),
            ],
        )

    async def test_wiki_generates_pages(
        self,
        tmp_path: Path,
        mock_index_status: IndexStatus,
        mock_llm_provider,
        mock_embedding_provider,
    ):
        """Test that wiki generation creates documentation pages."""
        wiki_path = tmp_path / ".deepwiki"
        wiki_path.mkdir()
        mock_store = create_mock_vector_store()

        with patch("local_deepwiki.generators.wiki.get_cached_llm_provider") as mock_get_llm:
            mock_get_llm.return_value = mock_llm_provider
            generator = WikiGenerator(
                wiki_path=wiki_path,
                vector_store=mock_store,
            )
            generator.llm = mock_llm_provider

            # Patch the internal calls that need the vector store
            with patch.object(
                generator, "_get_main_definition_lines", return_value={}
            ):
                wiki_structure = await generator.generate(
                    index_status=mock_index_status,
                    full_rebuild=True,
                )

            # Verify pages were generated
            assert len(wiki_structure.pages) > 0

            # Should have at least index and architecture pages
            page_paths = [p.path for p in wiki_structure.pages]
            assert "index.md" in page_paths

    async def test_wiki_writes_files(
        self,
        tmp_path: Path,
        mock_index_status: IndexStatus,
        mock_llm_provider,
    ):
        """Test that wiki generation writes markdown files to disk."""
        wiki_path = tmp_path / ".deepwiki"
        wiki_path.mkdir()
        mock_store = create_mock_vector_store()

        with patch("local_deepwiki.generators.wiki.get_cached_llm_provider") as mock_get_llm:
            mock_get_llm.return_value = mock_llm_provider
            generator = WikiGenerator(
                wiki_path=wiki_path,
                vector_store=mock_store,
            )
            generator.llm = mock_llm_provider

            with patch.object(
                generator, "_get_main_definition_lines", return_value={}
            ):
                await generator.generate(
                    index_status=mock_index_status,
                    full_rebuild=True,
                )

            # Verify files were written
            md_files = list(wiki_path.glob("**/*.md"))
            assert len(md_files) > 0

            # Verify index.md exists and has content
            index_file = wiki_path / "index.md"
            assert index_file.exists()
            content = index_file.read_text()
            assert len(content) > 0
# =============================================================================
# HTML Export Tests
# =============================================================================
class TestHtmlExportPipeline:
    """Tests for the HTML export phase."""

    @pytest.fixture
    def wiki_with_content(self, tmp_path: Path) -> Path:
        """Create a wiki directory with sample markdown content."""
        wiki_root = tmp_path / ".deepwiki"
        wiki_root.mkdir()

        # Markdown pages, including one nested under files/.
        pages = {
            "index.md": "# Sample Project\n\nThis is the main page.\n\n## Overview\n\nProject overview here.",
            "architecture.md": "# Architecture\n\nArchitecture documentation.\n\n## Components\n\n- Component A\n- Component B",
            "files/main.md": "# main.py\n\nMain module documentation.\n\n## Classes\n\n### Application",
        }
        for rel_path, markdown in pages.items():
            target = wiki_root / rel_path
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(markdown)

        # Table of contents consumed by the exporter.
        toc = {
            "title": "Sample Project",
            "entries": [
                {"number": "1", "title": "Overview", "path": "index.md"},
                {"number": "2", "title": "Architecture", "path": "architecture.md"},
                {
                    "number": "3",
                    "title": "Files",
                    "children": [
                        {"number": "3.1", "title": "main.py", "path": "files/main.md"}
                    ],
                },
            ],
        }
        (wiki_root / "toc.json").write_text(json.dumps(toc))

        # Client-side search index.
        search_index = [
            {
                "path": "index.md",
                "title": "Overview",
                "snippet": "Project overview here.",
                "headings": ["Sample Project", "Overview"],
                "terms": ["overview", "project"],
            },
            {
                "path": "architecture.md",
                "title": "Architecture",
                "snippet": "Architecture documentation.",
                "headings": ["Architecture", "Components"],
                "terms": ["architecture", "components"],
            },
        ]
        (wiki_root / "search.json").write_text(json.dumps(search_index))
        return wiki_root

    def test_html_export_creates_files(
        self, wiki_with_content: Path, tmp_path: Path
    ):
        """Test that HTML export creates HTML files from markdown."""
        dest = tmp_path / "html_output"
        exported = HtmlExporter(wiki_with_content, dest).export()

        # The reported count matches the HTML files on disk.
        assert exported > 0
        assert dest.exists()
        assert len(list(dest.glob("**/*.html"))) == exported

        # Verify index.html was created
        assert (dest / "index.html").exists()

    def test_html_export_includes_toc(
        self, wiki_with_content: Path, tmp_path: Path
    ):
        """Test that exported HTML includes table of contents."""
        dest = tmp_path / "html_output"
        HtmlExporter(wiki_with_content, dest).export()

        # Read index.html and check for TOC elements
        rendered = (dest / "index.html").read_text()
        assert "toc" in rendered.lower()
        assert "Overview" in rendered
        assert "Architecture" in rendered

    def test_html_export_preserves_structure(
        self, wiki_with_content: Path, tmp_path: Path
    ):
        """Test that HTML export preserves directory structure."""
        dest = tmp_path / "html_output"
        HtmlExporter(wiki_with_content, dest).export()

        # The nested files/main.md page maps to files/main.html.
        assert (dest / "files" / "main.html").exists()

    def test_html_export_copies_search_json(
        self, wiki_with_content: Path, tmp_path: Path
    ):
        """Test that HTML export copies search.json for client-side search."""
        dest = tmp_path / "html_output"
        HtmlExporter(wiki_with_content, dest).export()

        # search.json must travel with the HTML for client-side search.
        assert (dest / "search.json").exists()

    def test_export_to_html_convenience_function(
        self, wiki_with_content: Path, tmp_path: Path
    ):
        """Test the export_to_html convenience function."""
        dest = tmp_path / "html_output"
        summary = export_to_html(wiki_with_content, dest)
        assert "Exported" in summary
        assert dest.exists()
# =============================================================================
# Full Pipeline Integration Tests
# =============================================================================
class TestFullPipeline:
    """End-to-end integration tests for the complete pipeline."""

    async def test_full_pipeline_index_to_wiki(
        self,
        sample_python_repo: Path,
        test_config: Config,
        mock_llm_provider,
    ):
        """Test the full pipeline from indexing to wiki generation."""
        # Step 1: Index the repository
        with patch("local_deepwiki.core.indexer.VectorStore") as MockVectorStore:
            mock_store = create_mock_vector_store()
            stored_chunks = []

            async def store_chunks(chunks):
                # Capture everything the indexer hands to the vector store.
                stored_chunks.extend(chunks)
                return len(chunks)

            mock_store.create_or_update_table = AsyncMock(side_effect=store_chunks)
            mock_store.add_chunks = AsyncMock(side_effect=store_chunks)
            MockVectorStore.return_value = mock_store

            indexer = RepositoryIndexer(sample_python_repo, test_config)
            indexer.vector_store = mock_store
            index_status = await indexer.index(full_rebuild=True)

            # Verify indexing succeeded
            assert index_status.total_files > 0
            assert index_status.total_chunks > 0
            # Fix: stored_chunks was captured but never checked. The chunks
            # reported in the status must actually have reached the store.
            assert len(stored_chunks) > 0

            # Step 2: Generate wiki documentation
            wiki_path = indexer.wiki_path
            with patch(
                "local_deepwiki.generators.wiki.get_cached_llm_provider"
            ) as mock_get_llm:
                mock_get_llm.return_value = mock_llm_provider
                generator = WikiGenerator(
                    wiki_path=wiki_path,
                    vector_store=mock_store,
                    config=test_config,
                )
                generator.llm = mock_llm_provider

                with patch.object(
                    generator, "_get_main_definition_lines", return_value={}
                ):
                    wiki_structure = await generator.generate(
                        index_status=index_status,
                        full_rebuild=True,
                    )

                # Verify wiki was generated
                assert len(wiki_structure.pages) > 0
                assert (wiki_path / "index.md").exists()

    async def test_full_pipeline_with_html_export(
        self,
        sample_python_repo: Path,
        test_config: Config,
        mock_llm_provider,
        tmp_path: Path,
    ):
        """Test the complete pipeline including HTML export."""
        # Step 1: Index
        with patch("local_deepwiki.core.indexer.VectorStore") as MockVectorStore:
            mock_store = create_mock_vector_store()

            async def store_chunks(chunks):
                return len(chunks)

            mock_store.create_or_update_table = AsyncMock(side_effect=store_chunks)
            mock_store.add_chunks = AsyncMock(side_effect=store_chunks)
            MockVectorStore.return_value = mock_store

            indexer = RepositoryIndexer(sample_python_repo, test_config)
            indexer.vector_store = mock_store
            index_status = await indexer.index(full_rebuild=True)

            # Step 2: Generate wiki
            wiki_path = indexer.wiki_path
            with patch(
                "local_deepwiki.generators.wiki.get_cached_llm_provider"
            ) as mock_get_llm:
                mock_get_llm.return_value = mock_llm_provider
                generator = WikiGenerator(
                    wiki_path=wiki_path,
                    vector_store=mock_store,
                    config=test_config,
                )
                generator.llm = mock_llm_provider

                with patch.object(
                    generator, "_get_main_definition_lines", return_value={}
                ):
                    await generator.generate(
                        index_status=index_status,
                        full_rebuild=True,
                    )

            # Step 3: Export to HTML
            html_output = tmp_path / "html_export"
            result = export_to_html(wiki_path, html_output)

            # Verify end-to-end success
            assert "Exported" in result
            assert html_output.exists()
            assert (html_output / "index.html").exists()

    async def test_pipeline_progress_callback(
        self,
        sample_python_repo: Path,
        test_config: Config,
    ):
        """Test that progress callbacks are invoked during pipeline execution."""
        progress_messages = []

        def progress_callback(msg: str, current: int, total: int):
            progress_messages.append((msg, current, total))

        with patch("local_deepwiki.core.indexer.VectorStore") as MockVectorStore:
            mock_store = MagicMock()
            mock_store.create_or_update_table = AsyncMock(return_value=0)
            mock_store.add_chunks = AsyncMock(return_value=0)
            mock_store.delete_chunks_by_files = AsyncMock(return_value=0)
            MockVectorStore.return_value = mock_store

            indexer = RepositoryIndexer(sample_python_repo, test_config)
            indexer.vector_store = mock_store
            await indexer.index(full_rebuild=True, progress_callback=progress_callback)

            # Verify progress was reported
            assert len(progress_messages) > 0

            # Verify message format
            for msg, current, total in progress_messages:
                assert isinstance(msg, str)
                assert isinstance(current, int)
                assert isinstance(total, int)
# =============================================================================
# Error Handling Tests
# =============================================================================
class TestPipelineErrorHandling:
    """Tests for error handling throughout the pipeline."""

    async def test_index_handles_parse_errors(
        self, tmp_path: Path, test_config: Config
    ):
        """Test that indexing handles files that fail to parse."""
        repo_root = tmp_path / "repo"
        repo_root.mkdir()
        # One well-formed file plus one edge case (comments only, no code).
        (repo_root / "valid.py").write_text("def valid_function(): pass")
        (repo_root / "edge_case.py").write_text(
            "# Just a comment file\n# No actual code\n"
        )

        with patch("local_deepwiki.core.indexer.VectorStore") as MockVectorStore:
            store = MagicMock()
            store.create_or_update_table = AsyncMock(return_value=0)
            store.add_chunks = AsyncMock(return_value=0)
            MockVectorStore.return_value = store

            indexer = RepositoryIndexer(repo_root, test_config)
            indexer.vector_store = store

            # Should not raise, even with edge case files
            status = await indexer.index(full_rebuild=True)

            # At minimum the valid file must have been processed.
            assert status.total_files >= 1

    async def test_index_handles_empty_repo(
        self, tmp_path: Path, test_config: Config
    ):
        """Test that indexing handles empty repositories gracefully."""
        repo_root = tmp_path / "empty_repo"
        repo_root.mkdir()

        with patch("local_deepwiki.core.indexer.VectorStore") as MockVectorStore:
            store = MagicMock()
            store.create_or_update_table = AsyncMock(return_value=0)
            store.add_chunks = AsyncMock(return_value=0)
            MockVectorStore.return_value = store

            indexer = RepositoryIndexer(repo_root, test_config)
            indexer.vector_store = store
            status = await indexer.index(full_rebuild=True)

            # Nothing to index: both counters stay at zero.
            assert status.total_files == 0
            assert status.total_chunks == 0

    def test_html_export_handles_missing_wiki(self, tmp_path: Path):
        """Test that HTML export handles non-existent wiki directory."""
        missing_wiki = tmp_path / "non_existent_wiki"
        dest = tmp_path / "output"

        # The exporter should handle missing wiki gracefully:
        # export completes without error and reports zero pages.
        assert HtmlExporter(missing_wiki, dest).export() == 0

    def test_html_export_handles_invalid_toc(
        self, tmp_path: Path
    ):
        """Test that HTML export handles invalid toc.json."""
        wiki_root = tmp_path / ".deepwiki"
        wiki_root.mkdir()
        (wiki_root / "index.md").write_text("# Test\n\nContent here.")
        # Deliberately corrupted TOC; json parsing of it raises JSONDecodeError.
        (wiki_root / "toc.json").write_text("not valid json {{{")
        dest = tmp_path / "output"

        # Either the exporter tolerates the bad TOC and exports pages, or it
        # raises JSONDecodeError. This test documents both as acceptable.
        try:
            exported = HtmlExporter(wiki_root, dest).export()
        except json.JSONDecodeError:
            # Expected behavior: invalid JSON surfaces as a decode error.
            pass
        else:
            assert exported > 0