"""Tests for vector store functionality."""
import asyncio
import time
import pytest
from local_deepwiki.config import EmbeddingBatchConfig, SearchCacheConfig
from local_deepwiki.models import ChunkType, CodeChunk, Language
from local_deepwiki.providers.base import EmbeddingProvider
class MockEmbeddingProvider(EmbeddingProvider):
    """Deterministic stub provider returning constant embeddings.

    Records every batch passed to :meth:`embed` so tests can inspect how
    the vector store batches its embedding requests.
    """

    def __init__(self, dimension: int = 384, name: str = "mock"):
        self._dimension = dimension
        self._name = name
        # One entry per embed() call: the exact list of texts received.
        self.embed_calls: list[list[str]] = []

    @property
    def name(self) -> str:
        """Return the configured provider name."""
        return self._name

    def get_dimension(self) -> int:
        """Return the configured embedding dimension."""
        return self._dimension

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Record the batch and return one constant 0.1-vector per text."""
        self.embed_calls.append(texts)
        vector = [0.1] * self._dimension
        return [list(vector) for _ in texts]
class SlowMockEmbeddingProvider(EmbeddingProvider):
    """Stub provider that sleeps on every call, for parallel-execution tests.

    Besides recording each batch, it records the wall-clock start time of
    every :meth:`embed` call so tests can verify calls overlap in time.
    """

    def __init__(self, dimension: int = 384, delay_seconds: float = 0.1, name: str = "local:slow-mock"):
        self._dimension = dimension
        self._delay_seconds = delay_seconds
        # Name is configurable so tests can simulate different provider types.
        self._name = name
        self.embed_calls: list[list[str]] = []
        self.call_times: list[float] = []

    @property
    def name(self) -> str:
        """Return the configured provider name."""
        return self._name

    def get_dimension(self) -> int:
        """Return the configured embedding dimension."""
        return self._dimension

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Record call start time and batch, sleep, then return constant vectors."""
        self.call_times.append(time.time())
        self.embed_calls.append(texts)
        await asyncio.sleep(self._delay_seconds)
        vector = [0.1] * self._dimension
        return [list(vector) for _ in texts]
class FailingMockEmbeddingProvider(EmbeddingProvider):
    """Stub provider that raises ConnectionError on configured calls.

    Two failure modes, checked in order:

    1. Per-batch: if a batch's id (hash of its first text) is in
       ``fail_on_batches``, that batch fails until it has been retried
       ``fail_count`` times.
    2. Global: the first ``fail_count`` calls overall fail.
    """

    def __init__(
        self,
        dimension: int = 384,
        fail_count: int = 2,
        fail_on_batches: set[int] | None = None,
    ):
        self._dimension = dimension
        self._fail_count = fail_count
        self._call_count = 0
        self._fail_on_batches = fail_on_batches or set()
        # Per-batch attempt counters, keyed by hash of the batch's first text.
        self._batch_call_counts: dict[int, int] = {}
        self.embed_calls: list[list[str]] = []

    @property
    def name(self) -> str:
        """Return provider name."""
        return "mock:failing"

    def get_dimension(self) -> int:
        """Return embedding dimension."""
        return self._dimension

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Record the call, then fail or succeed per the configured policy."""
        self.embed_calls.append(texts)
        self._call_count += 1
        # Identify the batch by the hash of its first text (0 for empty batches).
        batch_id = hash(texts[0]) if texts else 0
        attempts = self._batch_call_counts.get(batch_id, 0) + 1
        self._batch_call_counts[batch_id] = attempts
        # Batch-targeted failures: keep failing until the retry budget is spent.
        if self._fail_on_batches and batch_id in self._fail_on_batches:
            if attempts <= self._fail_count:
                raise ConnectionError(f"Simulated connection error (attempt {attempts})")
        # Global failures: the first N calls fail regardless of batch.
        if self._call_count <= self._fail_count:
            raise ConnectionError(f"Simulated connection error (call {self._call_count})")
        vector = [0.1] * self._dimension
        return [list(vector) for _ in texts]
class RateLimitMockEmbeddingProvider(EmbeddingProvider):
    """Stub provider that raises a rate-limit error on exactly one call.

    The Nth call (``rate_limit_after``) raises; all other calls succeed
    with constant embeddings.
    """

    def __init__(self, dimension: int = 384, rate_limit_after: int = 3):
        self._dimension = dimension
        self._rate_limit_after = rate_limit_after
        self._call_count = 0
        self.embed_calls: list[list[str]] = []

    @property
    def name(self) -> str:
        """Return provider name (mimics an API-backed provider)."""
        return "openai:rate-limited"  # Simulates API provider

    def get_dimension(self) -> int:
        """Return embedding dimension."""
        return self._dimension

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Record the call; raise on the configured call number, else embed."""
        self.embed_calls.append(texts)
        self._call_count += 1
        if self._call_count == self._rate_limit_after:
            raise Exception("Rate limit exceeded. Please retry after 60 seconds.")
        vector = [0.1] * self._dimension
        return [list(vector) for _ in texts]
class SemanticMockEmbeddingProvider(EmbeddingProvider):
    """Stub provider whose embeddings depend on the text content.

    Produces deterministic, hash-derived vectors that differ strongly
    between different texts (cosine similarity below the typical cache
    threshold), so tests can exercise cache hits vs. misses.
    """

    def __init__(self, dimension: int = 384):
        self._dimension = dimension
        self.embed_calls: list[list[str]] = []

    @property
    def name(self) -> str:
        """Return provider name."""
        return "semantic_mock"

    def get_dimension(self) -> int:
        """Return embedding dimension."""
        return self._dimension

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Derive a deterministic, content-dependent vector for each text.

        Each component is seeded from the text's hash and the component
        index, then pushed through a sine transform so that distinct texts
        yield very different vectors.
        """
        import math

        self.embed_calls.append(texts)
        results: list[list[float]] = []
        for text in texts:
            # Mask to 32 bits so the seed arithmetic stays non-negative.
            text_hash = hash(text) & 0xFFFFFFFF
            vector: list[float] = []
            for i in range(self._dimension):
                # Mix the hash with the index and a large odd constant to
                # decorrelate components, then map through sin for variety.
                seed = (text_hash * (i + 1) * 31337) & 0xFFFFFFFF
                vector.append(0.5 + 0.5 * math.sin(seed * 0.0001 + i * 0.1))
            results.append(vector)
        return results
def make_chunk(
    id: str,
    file_path: str = "test.py",
    content: str = "test code",
    language: Language = Language.PYTHON,
    chunk_type: ChunkType = ChunkType.FUNCTION,
) -> CodeChunk:
    """Build a CodeChunk with test-friendly defaults.

    The chunk's name is derived from *id*, and the line span is fixed at
    1-10; only the fields tests care about are parameterized.
    """
    chunk = CodeChunk(
        id=id,
        file_path=file_path,
        language=language,
        chunk_type=chunk_type,
        name=f"test_{id}",
        content=content,
        start_line=1,
        end_line=10,
    )
    return chunk
class TestVectorStoreIndexes:
    """Tests for vector store scalar indexes."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Create a vector store for testing."""
        from local_deepwiki.core.vectorstore import VectorStore

        db_path = tmp_path / "test.lance"
        provider = MockEmbeddingProvider()
        return VectorStore(db_path, provider)

    @pytest.fixture
    async def populated_store(self, vector_store):
        """Create a vector store populated with chunks across three files."""
        chunks = [
            make_chunk("chunk_1", "src/main.py", "def main(): pass"),
            make_chunk("chunk_2", "src/main.py", "def helper(): pass"),
            make_chunk("chunk_3", "src/utils.py", "def util(): pass"),
            make_chunk("chunk_4", "tests/test.py", "def test(): pass"),
        ]
        await vector_store.create_or_update_table(chunks)
        return vector_store

    async def test_create_table_creates_indexes(self, populated_store):
        """Test that creating a table creates scalar indexes."""
        table = populated_store._get_table()
        assert table is not None
        # list_indices() may return dicts or index objects depending on the
        # LanceDB version, so extract names defensively — same pattern as
        # TestVectorIndex.test_vector_index_not_created_for_small_tables.
        indexes = {
            idx.get("name", "") if isinstance(idx, dict) else getattr(idx, "name", "")
            for idx in table.list_indices()
        }
        # Index names are based on column names
        assert "id_idx" in indexes or any("id" in name for name in indexes)

    async def test_get_chunk_by_id_uses_index(self, populated_store):
        """Test that get_chunk_by_id can find chunks efficiently."""
        # Should find existing chunk
        chunk = await populated_store.get_chunk_by_id("chunk_1")
        assert chunk is not None
        assert chunk.id == "chunk_1"
        assert chunk.file_path == "src/main.py"
        # Should return None for non-existent chunk
        chunk = await populated_store.get_chunk_by_id("nonexistent")
        assert chunk is None

    async def test_get_chunks_by_file_uses_index(self, populated_store):
        """Test that get_chunks_by_file can find chunks efficiently."""
        # Get all chunks for main.py
        chunks = await populated_store.get_chunks_by_file("src/main.py")
        assert len(chunks) == 2
        assert all(c.file_path == "src/main.py" for c in chunks)
        # Get chunks for different file
        chunks = await populated_store.get_chunks_by_file("src/utils.py")
        assert len(chunks) == 1
        assert chunks[0].id == "chunk_3"
        # Non-existent file returns empty list
        chunks = await populated_store.get_chunks_by_file("nonexistent.py")
        assert chunks == []

    async def test_delete_chunks_by_file_uses_index(self, populated_store):
        """Test that delete_chunks_by_file works efficiently."""
        # Verify chunks exist before delete
        chunks_before = await populated_store.get_chunks_by_file("src/main.py")
        assert len(chunks_before) == 2
        # Delete chunks for main.py
        await populated_store.delete_chunks_by_file("src/main.py")
        # Verify deletion by checking chunks are gone
        chunks = await populated_store.get_chunks_by_file("src/main.py")
        assert len(chunks) == 0
        # Other files unaffected
        chunks = await populated_store.get_chunks_by_file("src/utils.py")
        assert len(chunks) == 1

    async def test_delete_chunks_by_files_batch(self, populated_store):
        """Test that delete_chunks_by_files deletes multiple files in one operation."""
        # Verify chunks exist before delete
        chunks_main = await populated_store.get_chunks_by_file("src/main.py")
        chunks_utils = await populated_store.get_chunks_by_file("src/utils.py")
        assert len(chunks_main) == 2
        assert len(chunks_utils) == 1
        # Batch delete chunks for both files
        result = await populated_store.delete_chunks_by_files(["src/main.py", "src/utils.py"])
        assert result == 2  # Returns count of file paths processed
        # Verify all chunks are gone
        chunks = await populated_store.get_chunks_by_file("src/main.py")
        assert len(chunks) == 0
        chunks = await populated_store.get_chunks_by_file("src/utils.py")
        assert len(chunks) == 0

    async def test_delete_chunks_by_files_empty_list(self, populated_store):
        """Test that delete_chunks_by_files handles empty list."""
        result = await populated_store.delete_chunks_by_files([])
        assert result == 0
        # Verify nothing was deleted
        chunks = await populated_store.get_chunks_by_file("src/main.py")
        assert len(chunks) == 2

    async def test_delete_chunks_by_files_nonexistent(self, populated_store):
        """Test that delete_chunks_by_files handles nonexistent files gracefully."""
        result = await populated_store.delete_chunks_by_files(["nonexistent1.py", "nonexistent2.py"])
        assert result == 2  # Returns count of paths processed, even if no rows matched
        # Verify existing chunks unaffected
        chunks = await populated_store.get_chunks_by_file("src/main.py")
        assert len(chunks) == 2

    async def test_delete_chunks_by_files_with_quotes(self, vector_store):
        """Test batch delete with file paths containing quotes."""
        chunks = [
            make_chunk("test1", file_path="path'one.py"),
            make_chunk("test2", file_path="path'two.py"),
            make_chunk("test3", file_path="normal.py"),
        ]
        await vector_store.create_or_update_table(chunks)
        # Batch delete files with quotes
        await vector_store.delete_chunks_by_files(["path'one.py", "path'two.py"])
        # Verify deletion
        chunks = await vector_store.get_chunks_by_file("path'one.py")
        assert len(chunks) == 0
        chunks = await vector_store.get_chunks_by_file("path'two.py")
        assert len(chunks) == 0
        # Normal file unaffected
        chunks = await vector_store.get_chunks_by_file("normal.py")
        assert len(chunks) == 1

    async def test_ensure_indexes_on_existing_table(self, vector_store, tmp_path):
        """Test that opening an existing table ensures indexes exist."""
        # Create table with data
        chunks = [make_chunk("test_1")]
        await vector_store.create_or_update_table(chunks)
        # Create new store instance pointing to same DB
        from local_deepwiki.core.vectorstore import VectorStore

        new_store = VectorStore(tmp_path / "test.lance", MockEmbeddingProvider())
        # Get table (should ensure indexes)
        table = new_store._get_table()
        assert table is not None
        # Should be able to use indexed lookups
        chunk = await new_store.get_chunk_by_id("test_1")
        assert chunk is not None
class TestVectorStoreSearch:
    """Tests for vector store search functionality."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Build a fresh VectorStore backed by a temporary LanceDB path."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(tmp_path / "test.lance", MockEmbeddingProvider())

    async def test_search_empty_store(self, vector_store):
        """Searching before any data is indexed yields no results."""
        assert await vector_store.search("test query") == []

    async def test_search_with_results(self, vector_store):
        """Searching an indexed store returns scored chunk results."""
        await vector_store.create_or_update_table(
            [
                make_chunk("func_1", content="def calculate_sum(a, b): return a + b"),
                make_chunk("func_2", content="def calculate_product(a, b): return a * b"),
            ]
        )
        hits = await vector_store.search("calculate")
        assert len(hits) > 0
        # Every hit carries a chunk and a non-negative score.
        for hit in hits:
            assert hit.chunk is not None
            assert hit.score >= 0

    async def test_search_with_language_filter(self, vector_store):
        """A language filter restricts results to that language."""
        await vector_store.create_or_update_table(
            [
                make_chunk("py_1", language=Language.PYTHON),
                make_chunk("ts_1", language=Language.TYPESCRIPT),
            ]
        )
        hits = await vector_store.search("test", language="python")
        assert all(hit.chunk.language == Language.PYTHON for hit in hits)

    async def test_search_invalid_language_raises(self, vector_store):
        """An unknown language filter raises ValueError."""
        await vector_store.create_or_update_table([make_chunk("test_1")])
        with pytest.raises(ValueError, match="Invalid language filter"):
            await vector_store.search("test", language="invalid_lang")

    async def test_search_with_chunk_type_filter(self, vector_store):
        """A chunk_type filter restricts results to that chunk type."""
        await vector_store.create_or_update_table(
            [
                make_chunk("func_1", chunk_type=ChunkType.FUNCTION),
                make_chunk("class_1", chunk_type=ChunkType.CLASS),
            ]
        )
        hits = await vector_store.search("test", chunk_type="function")
        assert all(hit.chunk.chunk_type == ChunkType.FUNCTION for hit in hits)

    async def test_search_invalid_chunk_type_raises(self, vector_store):
        """An unknown chunk_type filter raises ValueError."""
        await vector_store.create_or_update_table([make_chunk("test_1")])
        with pytest.raises(ValueError, match="Invalid chunk_type filter"):
            await vector_store.search("test", chunk_type="invalid_type")
class TestVectorStoreStats:
    """Tests for vector store statistics."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Build a fresh VectorStore backed by a temporary LanceDB path."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(tmp_path / "test.lance", MockEmbeddingProvider())

    def test_stats_empty_store(self, vector_store):
        """An empty store reports zero chunks and empty breakdowns."""
        stats = vector_store.get_stats()
        assert stats["total_chunks"] == 0
        assert stats["languages"] == {}
        assert stats["chunk_types"] == {}

    async def test_stats_with_data(self, vector_store):
        """Stats aggregate counts by language, chunk type, and file."""
        await vector_store.create_or_update_table(
            [
                make_chunk("py_func", language=Language.PYTHON, chunk_type=ChunkType.FUNCTION),
                make_chunk("py_class", language=Language.PYTHON, chunk_type=ChunkType.CLASS),
                make_chunk("ts_func", language=Language.TYPESCRIPT, chunk_type=ChunkType.FUNCTION),
            ]
        )
        stats = vector_store.get_stats()
        assert stats["total_chunks"] == 3
        assert stats["languages"]["python"] == 2
        assert stats["languages"]["typescript"] == 1
        assert stats["chunk_types"]["function"] == 2
        assert stats["chunk_types"]["class"] == 1
        # All three chunks share make_chunk's default file_path.
        assert stats["files"] == 1
class TestVectorStoreAddChunks:
    """Tests for adding chunks to existing table."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Build a fresh VectorStore backed by a temporary LanceDB path."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(tmp_path / "test.lance", MockEmbeddingProvider())

    async def test_add_to_empty_creates_table(self, vector_store):
        """Adding chunks to an empty store implicitly creates the table."""
        added = await vector_store.add_chunks([make_chunk("test_1")])
        assert added == 1
        # The data is queryable afterwards.
        assert vector_store.get_stats()["total_chunks"] == 1

    async def test_add_to_existing_table(self, vector_store):
        """Adding chunks appends to an already-created table."""
        await vector_store.create_or_update_table([make_chunk("initial_1")])
        added = await vector_store.add_chunks(
            [make_chunk("additional_1"), make_chunk("additional_2")]
        )
        assert added == 2
        # Original plus appended chunks are all present.
        assert vector_store.get_stats()["total_chunks"] == 3

    async def test_add_empty_list(self, vector_store):
        """Adding an empty list is a no-op that returns 0."""
        assert await vector_store.add_chunks([]) == 0
class TestVectorStoreEdgeCases:
    """Tests for vector store edge cases and error handling."""
    @pytest.fixture
    def vector_store(self, tmp_path):
        """Create a vector store for testing."""
        from local_deepwiki.core.vectorstore import VectorStore
        db_path = tmp_path / "test.lance"
        provider = MockEmbeddingProvider()
        return VectorStore(db_path, provider)
    # --- Empty database operations ---
    async def test_get_chunk_by_id_empty_db(self, vector_store):
        """Test get_chunk_by_id on empty database returns None."""
        result = await vector_store.get_chunk_by_id("nonexistent")
        assert result is None
    async def test_get_chunks_by_file_empty_db(self, vector_store):
        """Test get_chunks_by_file on empty database returns empty list."""
        result = await vector_store.get_chunks_by_file("nonexistent.py")
        assert result == []
    async def test_delete_chunks_by_file_empty_db(self, vector_store):
        """Test delete_chunks_by_file on empty database returns 0."""
        deleted = await vector_store.delete_chunks_by_file("nonexistent.py")
        assert deleted == 0
    async def test_create_or_update_empty_list(self, vector_store):
        """Test create_or_update_table with empty list returns 0."""
        result = await vector_store.create_or_update_table([])
        assert result == 0
        assert vector_store.get_stats()["total_chunks"] == 0
    # --- Special characters and injection protection ---
    async def test_chunk_id_with_quotes(self, vector_store):
        """Test chunk ID with single quotes is handled safely."""
        chunk = make_chunk("test'quote", content="test content")
        await vector_store.create_or_update_table([chunk])
        # Should not raise or cause injection
        result = await vector_store.get_chunk_by_id("test'quote")
        assert result is not None
        assert result.id == "test'quote"
    async def test_file_path_with_quotes(self, vector_store):
        """Test file path with quotes is handled safely."""
        chunk = make_chunk("test1", file_path="path'with'quotes.py")
        await vector_store.create_or_update_table([chunk])
        # Should not raise or cause injection
        results = await vector_store.get_chunks_by_file("path'with'quotes.py")
        assert len(results) == 1
        assert results[0].file_path == "path'with'quotes.py"
    async def test_delete_file_path_with_quotes(self, vector_store):
        """Test deleting file path with quotes is handled safely."""
        chunk = make_chunk("test1", file_path="path'with'quotes.py")
        await vector_store.create_or_update_table([chunk])
        # Should delete successfully without injection
        await vector_store.delete_chunks_by_file("path'with'quotes.py")
        # Verify deletion by checking chunks are gone
        chunks = await vector_store.get_chunks_by_file("path'with'quotes.py")
        assert len(chunks) == 0
    async def test_chunk_id_injection_attempt(self, vector_store):
        """Test that SQL-like injection in chunk_id is neutralized."""
        chunk = make_chunk("safe_chunk", content="test")
        await vector_store.create_or_update_table([chunk])
        # Attempt injection - should return None, not cause error
        malicious_id = "'; DROP TABLE code_chunks; --"
        result = await vector_store.get_chunk_by_id(malicious_id)
        assert result is None
        # Original chunk should still exist
        result = await vector_store.get_chunk_by_id("safe_chunk")
        assert result is not None
    async def test_file_path_injection_attempt(self, vector_store):
        """Test that SQL-like injection in file_path is neutralized."""
        chunk = make_chunk("chunk1", file_path="safe.py")
        await vector_store.create_or_update_table([chunk])
        # Attempt injection - should return empty, not cause error
        malicious_path = "' OR '1'='1"
        results = await vector_store.get_chunks_by_file(malicious_path)
        assert results == []
        # Original chunk should still exist
        results = await vector_store.get_chunks_by_file("safe.py")
        assert len(results) == 1
    async def test_unicode_content(self, vector_store):
        """Test handling of Unicode content in chunks."""
        # Mixed CJK, emoji, and Cyrillic content must round-trip intact.
        chunk = make_chunk("unicode_test", content="def hello(): return '你好世界 🌍 Привет мир'")
        await vector_store.create_or_update_table([chunk])
        result = await vector_store.get_chunk_by_id("unicode_test")
        assert result is not None
        assert "你好世界" in result.content
        assert "🌍" in result.content
    # --- Database state handling ---
    async def test_reopen_database(self, tmp_path):
        """Test reopening database preserves data."""
        from local_deepwiki.core.vectorstore import VectorStore
        db_path = tmp_path / "test.lance"
        provider = MockEmbeddingProvider()
        # Create store and add data
        store1 = VectorStore(db_path, provider)
        chunk = make_chunk("persistent", content="test data")
        await store1.create_or_update_table([chunk])
        # Create new store instance pointing to same path
        store2 = VectorStore(db_path, provider)
        # Should find the data
        result = await store2.get_chunk_by_id("persistent")
        assert result is not None
        assert result.id == "persistent"
    async def test_replace_existing_table(self, vector_store):
        """Test create_or_update_table replaces existing data."""
        # Create initial data
        initial_chunks = [make_chunk("old_1"), make_chunk("old_2")]
        await vector_store.create_or_update_table(initial_chunks)
        assert vector_store.get_stats()["total_chunks"] == 2
        # Replace with new data
        new_chunks = [make_chunk("new_1")]
        await vector_store.create_or_update_table(new_chunks)
        # Old data should be gone
        assert vector_store.get_stats()["total_chunks"] == 1
        old_chunk = await vector_store.get_chunk_by_id("old_1")
        assert old_chunk is None
        new_chunk = await vector_store.get_chunk_by_id("new_1")
        assert new_chunk is not None
    async def test_db_path_created_if_not_exists(self, tmp_path):
        """Test that database directory is created if it doesn't exist."""
        from local_deepwiki.core.vectorstore import VectorStore
        nested_path = tmp_path / "nested" / "deep" / "db.lance"
        provider = MockEmbeddingProvider()
        store = VectorStore(nested_path, provider)
        chunk = make_chunk("test")
        await store.create_or_update_table([chunk])
        # Path should be created
        assert nested_path.parent.exists()
    # --- Boundary conditions ---
    async def test_single_chunk_operations(self, vector_store):
        """Test operations with single chunk."""
        chunk = make_chunk("single", content="single test")
        await vector_store.create_or_update_table([chunk])
        # Search
        results = await vector_store.search("single")
        assert len(results) == 1
        # Get by ID
        result = await vector_store.get_chunk_by_id("single")
        assert result is not None
        # Stats
        stats = vector_store.get_stats()
        assert stats["total_chunks"] == 1
    async def test_empty_content_chunk(self, vector_store):
        """Test chunk with empty content."""
        chunk = make_chunk("empty_content", content="")
        await vector_store.create_or_update_table([chunk])
        result = await vector_store.get_chunk_by_id("empty_content")
        assert result is not None
        assert result.content == ""
    async def test_large_content_chunk(self, vector_store):
        """Test chunk with large content."""
        large_content = "x" * 100000  # 100KB of content
        chunk = make_chunk("large", content=large_content)
        await vector_store.create_or_update_table([chunk])
        result = await vector_store.get_chunk_by_id("large")
        assert result is not None
        assert len(result.content) == 100000
    async def test_many_chunks_same_file(self, vector_store):
        """Test many chunks from same file."""
        chunks = [
            make_chunk(f"chunk_{i}", file_path="big_file.py", content=f"content {i}")
            for i in range(50)
        ]
        await vector_store.create_or_update_table(chunks)
        # Get all chunks for file
        results = await vector_store.get_chunks_by_file("big_file.py")
        assert len(results) == 50
        # Delete all
        await vector_store.delete_chunks_by_file("big_file.py")
        # Verify deletion by checking chunks are gone
        results = await vector_store.get_chunks_by_file("big_file.py")
        assert len(results) == 0
    # --- Search edge cases ---
    async def test_search_limit_zero_raises(self, vector_store):
        """Test search with limit=0 raises ValueError."""
        chunk = make_chunk("test")
        await vector_store.create_or_update_table([chunk])
        # LanceDB requires limit > 0 for vector searches
        with pytest.raises(ValueError, match="Limit is required"):
            await vector_store.search("test", limit=0)
    async def test_search_very_long_query(self, vector_store):
        """Test search with very long query string."""
        chunk = make_chunk("test", content="simple content")
        await vector_store.create_or_update_table([chunk])
        long_query = "test " * 1000  # Very long query
        # Should not raise
        results = await vector_store.search(long_query, limit=5)
        # May or may not find results, but shouldn't crash
        assert isinstance(results, list)
class TestVectorIndex:
    """Tests for vector index creation and management."""
    @pytest.fixture
    def vector_store(self, tmp_path):
        """Create a vector store for testing."""
        from local_deepwiki.core.vectorstore import VectorStore
        db_path = tmp_path / "test.lance"
        provider = MockEmbeddingProvider()
        return VectorStore(db_path, provider)
    async def test_vector_index_not_created_for_small_tables(self, vector_store):
        """Test that vector index is not created for tables with < 1000 rows."""
        # Create a small table (4 chunks - well under 1000 threshold)
        chunks = [make_chunk(f"chunk_{i}") for i in range(4)]
        await vector_store.create_or_update_table(chunks)
        table = vector_store._get_table()
        assert table is not None
        # Check that we have scalar indexes but not necessarily vector index
        indexes = table.list_indices()
        # list_indices() entries may be dicts or index objects depending on
        # the LanceDB version, so extract names defensively.
        scalar_index_names = {
            idx.get("name", "") if isinstance(idx, dict) else getattr(idx, "name", "")
            for idx in indexes
        }
        # Scalar indexes should exist
        assert any("id" in name for name in scalar_index_names)
    async def test_create_vector_index_method_exists(self, vector_store):
        """Test that _create_vector_index method exists and is callable."""
        assert hasattr(vector_store, "_create_vector_index")
        assert callable(vector_store._create_vector_index)
    async def test_ensure_indexes_handles_missing_vector_index(self, vector_store):
        """Test that _ensure_indexes handles tables without vector index."""
        # Create table
        chunks = [make_chunk(f"chunk_{i}") for i in range(10)]
        await vector_store.create_or_update_table(chunks)
        # Manually call _ensure_indexes (simulates reopening existing table)
        vector_store._ensure_indexes()
        # Should not raise and scalar indexes should still work
        chunk = await vector_store.get_chunk_by_id("chunk_1")
        assert chunk is not None
    async def test_vector_index_threshold_is_1000(self, vector_store):
        """Verify the threshold for vector index creation is 1000 rows."""
        # This is a documentation test - verify the threshold is as expected
        # We don't create 1000+ rows in tests, but verify the logic exists
        # NOTE: this is deliberately coupled to the implementation source text.
        import inspect
        source = inspect.getsource(vector_store._create_vector_index)
        assert "1000" in source or "min_rows_for_index" in source
    async def test_search_works_without_vector_index(self, vector_store):
        """Test that search works correctly even without vector index (brute force)."""
        # Create a small table without vector index
        chunks = [
            make_chunk("chunk_1", content="hello world"),
            make_chunk("chunk_2", content="goodbye world"),
            make_chunk("chunk_3", content="hello there"),
        ]
        await vector_store.create_or_update_table(chunks)
        # Search should work (brute force O(n) without index)
        results = await vector_store.search("hello", limit=2)
        assert len(results) > 0
        # All results should be valid chunks
        for result in results:
            assert result.chunk is not None
            assert result.chunk.id in ["chunk_1", "chunk_2", "chunk_3"]
    async def test_ensure_indexes_called_on_table_open(self, vector_store, tmp_path):
        """Test that _ensure_indexes is called when opening existing table."""
        from local_deepwiki.core.vectorstore import VectorStore
        # Create table
        chunks = [make_chunk(f"chunk_{i}") for i in range(5)]
        await vector_store.create_or_update_table(chunks)
        # Create new VectorStore instance pointing to same DB
        provider = MockEmbeddingProvider()
        store2 = VectorStore(tmp_path / "test.lance", provider)
        # Access table (should trigger _ensure_indexes)
        table = store2._get_table()
        assert table is not None
        # Should still be able to search
        results = await store2.search("test", limit=5)
        assert isinstance(results, list)
class TestEnsureIndexesEdgeCases:
"""Tests for _ensure_indexes edge cases and error handling."""
@pytest.fixture
def vector_store(self, tmp_path):
"""Create a vector store for testing."""
from local_deepwiki.core.vectorstore import VectorStore
db_path = tmp_path / "test.lance"
provider = MockEmbeddingProvider()
return VectorStore(db_path, provider)
def test_ensure_indexes_when_table_is_none(self, vector_store):
"""Test _ensure_indexes returns early when table is None."""
# Table is None before any data is added
assert vector_store._table is None
# Should not raise
vector_store._ensure_indexes()
# Still None after call
assert vector_store._table is None
async def test_ensure_indexes_handles_list_indices_exception(self, vector_store):
"""Test _ensure_indexes handles exceptions from list_indices."""
from unittest.mock import MagicMock, patch
# Create table first
chunks = [make_chunk("test_1")]
await vector_store.create_or_update_table(chunks)
# Mock list_indices to raise RuntimeError
with patch.object(vector_store._table, "list_indices", side_effect=RuntimeError("Cannot list")):
# Should not raise, just log debug and continue
vector_store._ensure_indexes()
async def test_ensure_indexes_handles_type_error(self, vector_store):
"""Test _ensure_indexes handles TypeError from list_indices."""
from unittest.mock import patch
chunks = [make_chunk("test_1")]
await vector_store.create_or_update_table(chunks)
with patch.object(vector_store._table, "list_indices", side_effect=TypeError("Bad type")):
vector_store._ensure_indexes()
async def test_ensure_indexes_handles_key_error(self, vector_store):
"""Test _ensure_indexes handles KeyError from index access."""
from unittest.mock import patch
chunks = [make_chunk("test_1")]
await vector_store.create_or_update_table(chunks)
with patch.object(vector_store._table, "list_indices", side_effect=KeyError("Missing key")):
vector_store._ensure_indexes()
async def test_ensure_indexes_handles_attribute_error(self, vector_store):
"""Test _ensure_indexes handles AttributeError from index access."""
from unittest.mock import patch
chunks = [make_chunk("test_1")]
await vector_store.create_or_update_table(chunks)
with patch.object(vector_store._table, "list_indices", side_effect=AttributeError("No attr")):
vector_store._ensure_indexes()
async def test_ensure_indexes_handles_count_rows_exception(self, vector_store):
"""Test _ensure_indexes handles exception when checking row count."""
from unittest.mock import patch
chunks = [make_chunk("test_1")]
await vector_store.create_or_update_table(chunks)
# list_indices returns empty (so it tries to create vector index)
# count_rows raises exception
with patch.object(vector_store._table, "list_indices", return_value=[]):
with patch.object(vector_store._table, "count_rows", side_effect=RuntimeError("DB error")):
vector_store._ensure_indexes()
async def test_ensure_indexes_creates_missing_id_index(self, vector_store):
"""Test _ensure_indexes creates id_idx when missing."""
from unittest.mock import patch, MagicMock
chunks = [make_chunk("test_1")]
await vector_store.create_or_update_table(chunks)
# Mock list_indices to return indexes without id_idx
mock_indices = [{"name": "file_path_idx"}]
with patch.object(vector_store._table, "list_indices", return_value=mock_indices):
with patch.object(vector_store._table, "create_scalar_index") as mock_create:
with patch.object(vector_store._table, "count_rows", return_value=10):
vector_store._ensure_indexes()
# Should have tried to create id index
mock_create.assert_called()
async def test_ensure_indexes_creates_missing_file_path_index(self, vector_store):
    """_ensure_indexes should build file_path_idx when only id_idx exists."""
    from unittest.mock import patch

    await vector_store.create_or_update_table([make_chunk("test_1")])
    existing = [{"name": "id_idx"}]  # file_path_idx deliberately absent
    with (
        patch.object(vector_store._table, "list_indices", return_value=existing),
        patch.object(vector_store._table, "create_scalar_index") as create_spy,
        patch.object(vector_store._table, "count_rows", return_value=10),
    ):
        vector_store._ensure_indexes()
        create_spy.assert_called()
class TestCreateIndexSafeEdgeCases:
    """Tests for _create_index_safe edge cases."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Provide a fresh VectorStore backed by a temporary Lance path."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(tmp_path / "test.lance", MockEmbeddingProvider())

    async def _assert_error_swallowed(self, vector_store, error):
        """Populate the store, then check _create_index_safe absorbs `error`."""
        from unittest.mock import patch

        await vector_store.create_or_update_table([make_chunk("test_1")])
        with patch.object(vector_store._table, "create_scalar_index", side_effect=error):
            # Must not propagate the injected exception.
            vector_store._create_index_safe("test_column")

    def test_create_index_safe_when_table_is_none(self, vector_store):
        """_create_index_safe is a no-op while no table exists."""
        assert vector_store._table is None
        vector_store._create_index_safe("id")  # must not raise

    async def test_create_index_safe_handles_value_error(self, vector_store):
        """ValueError (index already exists) is absorbed."""
        await self._assert_error_swallowed(vector_store, ValueError("Index exists"))

    async def test_create_index_safe_handles_runtime_error(self, vector_store):
        """RuntimeError during index creation is absorbed."""
        await self._assert_error_swallowed(vector_store, RuntimeError("Creation failed"))

    async def test_create_index_safe_handles_os_error(self, vector_store):
        """OSError from the storage layer is absorbed."""
        await self._assert_error_swallowed(vector_store, OSError("Storage issue"))
class TestCreateVectorIndexEdgeCases:
    """Tests for _create_vector_index edge cases."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Create a vector store for testing."""
        from local_deepwiki.core.vectorstore import VectorStore

        db_path = tmp_path / "test.lance"
        provider = MockEmbeddingProvider()
        return VectorStore(db_path, provider)

    def test_create_vector_index_when_table_is_none(self, vector_store):
        """Test _create_vector_index returns early when table is None."""
        assert vector_store._table is None
        # Should not raise
        vector_store._create_vector_index(1000)

    async def test_create_vector_index_skipped_for_small_tables(self, vector_store):
        """Test _create_vector_index skips for tables under threshold."""
        from unittest.mock import patch

        chunks = [make_chunk("test_1")]
        await vector_store.create_or_update_table(chunks)
        with patch.object(vector_store._table, "create_index") as mock_create:
            vector_store._create_vector_index(999)  # Just under threshold
            mock_create.assert_not_called()

    async def test_create_vector_index_creates_for_large_tables(self, vector_store):
        """Test _create_vector_index creates index for tables at threshold."""
        from unittest.mock import patch

        chunks = [make_chunk("test_1")]
        await vector_store.create_or_update_table(chunks)
        with patch.object(vector_store._table, "create_index") as mock_create:
            vector_store._create_vector_index(1000)  # At threshold
            mock_create.assert_called_once()
            # Check it was called with correct params
            call_kwargs = mock_create.call_args[1]
            assert call_kwargs["metric"] == "L2"
            assert call_kwargs["num_sub_vectors"] == 16

    async def test_create_vector_index_calculates_partitions(self, vector_store):
        """Test _create_vector_index calculates correct number of partitions."""
        # Fix: removed the unused `import math` the original carried here.
        from unittest.mock import patch

        chunks = [make_chunk("test_1")]
        await vector_store.create_or_update_table(chunks)
        # Test with 10000 rows -> sqrt(10000) = 100 partitions
        with patch.object(vector_store._table, "create_index") as mock_create:
            vector_store._create_vector_index(10000)
            call_kwargs = mock_create.call_args[1]
            assert call_kwargs["num_partitions"] == 100
        # Test with very large table -> capped at 256
        with patch.object(vector_store._table, "create_index") as mock_create:
            vector_store._create_vector_index(100000)
            call_kwargs = mock_create.call_args[1]
            assert call_kwargs["num_partitions"] == 256

    async def test_create_vector_index_handles_value_error(self, vector_store):
        """Test _create_vector_index handles ValueError (index exists)."""
        from unittest.mock import patch

        chunks = [make_chunk("test_1")]
        await vector_store.create_or_update_table(chunks)
        with patch.object(
            vector_store._table, "create_index", side_effect=ValueError("Index exists")
        ):
            # Should not raise
            vector_store._create_vector_index(2000)

    async def test_create_vector_index_handles_runtime_error(self, vector_store):
        """Test _create_vector_index handles RuntimeError."""
        from unittest.mock import patch

        chunks = [make_chunk("test_1")]
        await vector_store.create_or_update_table(chunks)
        with patch.object(
            vector_store._table, "create_index", side_effect=RuntimeError("Creation failed")
        ):
            vector_store._create_vector_index(2000)

    async def test_create_vector_index_handles_os_error(self, vector_store):
        """Test _create_vector_index handles OSError."""
        from unittest.mock import patch

        chunks = [make_chunk("test_1")]
        await vector_store.create_or_update_table(chunks)
        with patch.object(
            vector_store._table, "create_index", side_effect=OSError("Storage issue")
        ):
            vector_store._create_vector_index(2000)
class TestBatchEmbed:
    """Tests for _batch_embed functionality."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Provide a VectorStore wired to the mock embedding provider."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(tmp_path / "test.lance", MockEmbeddingProvider())

    async def test_batch_embed_with_progress_logging(self, vector_store):
        """Ten texts in batches of three exercise the progress-logging path."""
        inputs = [f"text_{i}" for i in range(10)]
        vectors = await vector_store._batch_embed(inputs, batch_size=3, log_progress=True)
        assert len(vectors) == 10
        # Every embedding must match the provider's declared dimension.
        assert all(len(v) == 384 for v in vectors)

    async def test_batch_embed_without_progress_logging(self, vector_store):
        """With progress logging off, all texts are still embedded."""
        inputs = [f"text_{i}" for i in range(10)]
        vectors = await vector_store._batch_embed(inputs, batch_size=3, log_progress=False)
        assert len(vectors) == 10

    async def test_batch_embed_single_batch(self, vector_store):
        """A batch size above the input length yields one batch with no progress output."""
        vectors = await vector_store._batch_embed(
            ["text_1", "text_2"], batch_size=100, log_progress=True
        )
        assert len(vectors) == 2
class TestGetMainDefinitionLines:
    """Tests for get_main_definition_lines functionality."""

    @staticmethod
    def _chunk(cid, path, ctype, name, content, start, end):
        """Build a Python CodeChunk with the fields these tests vary."""
        return CodeChunk(
            id=cid,
            file_path=path,
            language=Language.PYTHON,
            chunk_type=ctype,
            name=name,
            content=content,
            start_line=start,
            end_line=end,
        )

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Provide a VectorStore over a temporary Lance database."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(tmp_path / "test.lance", MockEmbeddingProvider())

    def test_get_main_definition_lines_empty_store(self, vector_store):
        """An empty store yields an empty mapping."""
        assert vector_store.get_main_definition_lines() == {}

    async def test_get_main_definition_lines_with_functions(self, vector_store):
        """The earliest function in a file provides the line span."""
        chunks = [
            self._chunk("func1", "src/main.py", ChunkType.FUNCTION, "main",
                        "def main(): pass", 10, 20),
            self._chunk("func2", "src/main.py", ChunkType.FUNCTION, "helper",
                        "def helper(): pass", 25, 30),
        ]
        await vector_store.create_or_update_table(chunks)
        result = vector_store.get_main_definition_lines()
        assert "src/main.py" in result
        # The first (earliest) function wins.
        assert result["src/main.py"] == (10, 20)

    async def test_get_main_definition_lines_with_classes(self, vector_store):
        """A lone class chunk is reported with its line span."""
        chunks = [
            self._chunk("class1", "src/models.py", ChunkType.CLASS, "User",
                        "class User: pass", 5, 50),
        ]
        await vector_store.create_or_update_table(chunks)
        assert vector_store.get_main_definition_lines()["src/models.py"] == (5, 50)

    async def test_get_main_definition_lines_class_priority(self, vector_store):
        """A class starting before a function takes precedence."""
        chunks = [
            self._chunk("func1", "src/module.py", ChunkType.FUNCTION, "helper",
                        "def helper(): pass", 20, 25),
            self._chunk("class1", "src/module.py", ChunkType.CLASS, "MyClass",
                        "class MyClass: pass", 5, 15),
        ]
        await vector_store.create_or_update_table(chunks)
        # The class starts earlier, so its span is returned.
        assert vector_store.get_main_definition_lines()["src/module.py"] == (5, 15)

    async def test_get_main_definition_lines_function_first_when_earlier(self, vector_store):
        """A function starting before a class is kept."""
        chunks = [
            self._chunk("func1", "src/module.py", ChunkType.FUNCTION, "early_func",
                        "def early_func(): pass", 1, 5),
            self._chunk("class1", "src/module.py", ChunkType.CLASS, "LaterClass",
                        "class LaterClass: pass", 10, 20),
        ]
        await vector_store.create_or_update_table(chunks)
        assert vector_store.get_main_definition_lines()["src/module.py"] == (1, 5)

    async def test_get_main_definition_lines_multiple_files(self, vector_store):
        """Each file receives its own entry in the mapping."""
        chunks = [
            self._chunk("func1", "src/a.py", ChunkType.FUNCTION, "func_a",
                        "def func_a(): pass", 10, 20),
            self._chunk("class1", "src/b.py", ChunkType.CLASS, "ClassB",
                        "class ClassB: pass", 5, 50),
        ]
        await vector_store.create_or_update_table(chunks)
        result = vector_store.get_main_definition_lines()
        assert len(result) == 2
        assert result["src/a.py"] == (10, 20)
        assert result["src/b.py"] == (5, 50)

    async def test_get_main_definition_lines_ignores_other_types(self, vector_store):
        """Module-level chunks are excluded from the mapping."""
        chunks = [
            self._chunk("module1", "src/init.py", ChunkType.MODULE, "init",
                        "# module", 1, 5),
        ]
        await vector_store.create_or_update_table(chunks)
        assert vector_store.get_main_definition_lines() == {}

    async def test_get_main_definition_lines_same_type_keeps_earlier(self, vector_store):
        """Between two functions, the one starting earlier is kept."""
        chunks = [
            self._chunk("func1", "src/funcs.py", ChunkType.FUNCTION, "late_func",
                        "def late_func(): pass", 50, 60),
            self._chunk("func2", "src/funcs.py", ChunkType.FUNCTION, "early_func",
                        "def early_func(): pass", 10, 20),
        ]
        await vector_store.create_or_update_table(chunks)
        assert vector_store.get_main_definition_lines()["src/funcs.py"] == (10, 20)
class TestChunkToText:
    """Tests for _chunk_to_text functionality."""

    @staticmethod
    def _build(cid, ctype, name, content, start, end, **extras):
        """Construct a Python CodeChunk in src/module.py; extras pass through."""
        return CodeChunk(
            id=cid,
            file_path="src/module.py",
            language=Language.PYTHON,
            chunk_type=ctype,
            name=name,
            content=content,
            start_line=start,
            end_line=end,
            **extras,
        )

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Provide a VectorStore over a temporary Lance database."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(tmp_path / "test.lance", MockEmbeddingProvider())

    def test_chunk_to_text_with_parent_name(self, vector_store):
        """parent_name appears as an 'in <parent>' qualifier in the text."""
        chunk = self._build(
            "method1", ChunkType.FUNCTION, "my_method",
            "def my_method(self): pass", 10, 15,
            parent_name="MyClass",
        )
        text = vector_store._chunk_to_text(chunk)
        assert "in MyClass" in text
        assert "my_method" in text
        assert "python" in text

    def test_chunk_to_text_with_docstring(self, vector_store):
        """The chunk's docstring is folded into the embedding text."""
        chunk = self._build(
            "func1", ChunkType.FUNCTION, "documented_func",
            "def documented_func(): pass", 1, 5,
            docstring="This is the docstring for the function.",
        )
        text = vector_store._chunk_to_text(chunk)
        assert "This is the docstring" in text
        assert "documented_func" in text

    def test_chunk_to_text_with_parent_and_docstring(self, vector_store):
        """parent_name and docstring are both rendered alongside the code."""
        chunk = self._build(
            "method1", ChunkType.FUNCTION, "full_method",
            "def full_method(self): return True", 10, 20,
            parent_name="ParentClass",
            docstring="Method docstring here.",
        )
        text = vector_store._chunk_to_text(chunk)
        assert "in ParentClass" in text
        assert "Method docstring here" in text
        assert "full_method" in text
        assert "def full_method" in text

    def test_chunk_to_text_without_name(self, vector_store):
        """A nameless module chunk still produces language plus content."""
        chunk = self._build(
            "anon1", ChunkType.MODULE, None,
            "# Some module content", 1, 5,
        )
        text = vector_store._chunk_to_text(chunk)
        assert "python" in text
        assert "# Some module content" in text
class TestSanitizeStringValue:
    """Tests for _sanitize_string_value function."""

    @staticmethod
    def _sanitize(value):
        """Call the module-private sanitizer under test."""
        from local_deepwiki.core.vectorstore import _sanitize_string_value

        return _sanitize_string_value(value)

    def test_sanitize_single_quote(self):
        """A lone single quote is doubled (SQL-style escaping)."""
        assert self._sanitize("test'value") == "test''value"

    def test_sanitize_multiple_quotes(self):
        """Every single quote in the string is doubled."""
        assert self._sanitize("it's a 'test'") == "it''s a ''test''"

    def test_sanitize_no_quotes(self):
        """A quote-free string passes through unchanged."""
        assert self._sanitize("normal string") == "normal string"
class TestDeleteChunksByFilesEdgeCases:
    """Tests for delete_chunks_by_files edge cases."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Provide a VectorStore that has never created a table."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(tmp_path / "test.lance", MockEmbeddingProvider())

    async def test_delete_chunks_by_files_empty_db(self, vector_store):
        """Deleting from a store with no table reports zero rows removed."""
        deleted = await vector_store.delete_chunks_by_files(["file1.py", "file2.py"])
        assert deleted == 0
class TestEnsureIndexesVectorIndexDetection:
    """Tests for vector index detection in _ensure_indexes."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Provide a VectorStore over a temporary Lance database."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(tmp_path / "test.lance", MockEmbeddingProvider())

    async def _assert_vector_index_not_created(self, vector_store, reported_index):
        """With `reported_index` already listed, _ensure_indexes must not build another."""
        from unittest.mock import patch

        await vector_store.create_or_update_table([make_chunk("test_1")])
        with (
            patch.object(vector_store._table, "list_indices", return_value=[reported_index]),
            patch.object(vector_store._table, "create_index") as create_spy,
            patch.object(vector_store._table, "count_rows", return_value=2000),
        ):
            vector_store._ensure_indexes()
            create_spy.assert_not_called()

    async def test_ensure_indexes_detects_ivf_index(self, vector_store):
        """An object-style index reporting an IVF type suppresses creation."""
        from unittest.mock import MagicMock

        ivf_index = MagicMock()
        ivf_index.name = "vector_idx"
        ivf_index.index_type = "IVF_PQ"
        await self._assert_vector_index_not_created(vector_store, ivf_index)

    async def test_ensure_indexes_detects_ivf_in_dict_index(self, vector_store):
        """A dict-style index entry with an ivf type also suppresses creation."""
        await self._assert_vector_index_not_created(
            vector_store, {"name": "vector_idx", "index_type": "ivf_flat"}
        )
class TestSearchCache:
    """Tests for search result caching functionality."""

    @pytest.fixture
    def cache_config(self):
        """Cache settings shared by these tests."""
        return SearchCacheConfig(
            enabled=True,
            ttl_seconds=3600,
            max_entries=100,
            similarity_threshold=0.95,
        )

    @pytest.fixture
    def fuzzy_config(self):
        """Fuzzy-search settings with auto-fuzzy off so cached paths run."""
        from local_deepwiki.config import FuzzySearchConfig

        return FuzzySearchConfig(
            enable_auto_fuzzy=False,  # Disable so caching works with SemanticMockEmbeddingProvider
        )

    @pytest.fixture
    def vector_store(self, tmp_path, cache_config, fuzzy_config):
        """Store whose provider varies embeddings with the query text."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(
            tmp_path / "test.lance",
            SemanticMockEmbeddingProvider(),
            search_cache_config=cache_config,
            fuzzy_search_config=fuzzy_config,
        )

    @pytest.fixture
    async def populated_store(self, vector_store):
        """Store preloaded with three chunks.

        Note: create_or_update_table invalidates the cache once, so the
        invalidations counter already reads 1 when a test begins.
        """
        await vector_store.create_or_update_table(
            [
                make_chunk("func_1", content="def calculate_sum(a, b): return a + b"),
                make_chunk("func_2", content="def calculate_product(a, b): return a * b"),
                make_chunk("func_3", content="def parse_json(data): return json.loads(data)"),
            ]
        )
        return vector_store

    async def _assert_mutation_invalidates(self, store, mutate):
        """Prime the cache with one search, run `mutate`, and expect a flush."""
        await store.search("calculate")
        before = store.get_search_cache_stats()
        assert before["entries"] == 1
        await mutate()
        after = store.get_search_cache_stats()
        assert after["entries"] == 0
        assert after["invalidations"] == before["invalidations"] + 1

    async def test_search_cache_hit(self, populated_store):
        """Repeating an identical query is served from the cache."""
        first = await populated_store.search("calculate")
        after_first = populated_store.get_search_cache_stats()
        assert after_first["misses"] == 1
        assert after_first["hits"] == 0
        second = await populated_store.search("calculate")
        after_second = populated_store.get_search_cache_stats()
        assert after_second["misses"] == 1
        assert after_second["hits"] == 1
        # Cached results must match the originals chunk-for-chunk.
        assert len(first) == len(second)
        for lhs, rhs in zip(first, second):
            assert lhs.chunk.id == rhs.chunk.id

    async def test_search_cache_miss_different_query(self, populated_store):
        """Queries with distinct embeddings each miss the cache."""
        await populated_store.search("calculate")
        assert populated_store.get_search_cache_stats()["misses"] == 1
        # SemanticMockEmbeddingProvider keys embeddings off the first char,
        # so a query starting with a different letter cannot hit.
        await populated_store.search("parse json")
        assert populated_store.get_search_cache_stats()["misses"] == 2

    async def test_search_cache_miss_different_filters(self, populated_store):
        """Identical text with different filters forms a distinct cache key."""
        await populated_store.search("calculate")
        assert populated_store.get_search_cache_stats()["misses"] == 1
        await populated_store.search("calculate", language="python")
        assert populated_store.get_search_cache_stats()["misses"] == 2

    async def test_search_cache_invalidated_on_create_or_update(self, populated_store):
        """Rebuilding the table flushes cached search results."""
        async def mutate():
            await populated_store.create_or_update_table(
                [make_chunk("new_1", content="def new_function(): pass")]
            )

        await self._assert_mutation_invalidates(populated_store, mutate)

    async def test_search_cache_invalidated_on_add_chunks(self, populated_store):
        """Appending chunks flushes cached search results."""
        async def mutate():
            await populated_store.add_chunks(
                [make_chunk("added_1", content="def added_function(): pass")]
            )

        await self._assert_mutation_invalidates(populated_store, mutate)

    async def test_search_cache_invalidated_on_delete_chunks_by_file(self, populated_store):
        """Deleting one file's chunks flushes cached search results."""
        async def mutate():
            await populated_store.delete_chunks_by_file("test.py")

        await self._assert_mutation_invalidates(populated_store, mutate)

    async def test_search_cache_invalidated_on_delete_chunks_by_files(self, populated_store):
        """Deleting several files' chunks flushes cached search results."""
        async def mutate():
            await populated_store.delete_chunks_by_files(["test.py"])

        await self._assert_mutation_invalidates(populated_store, mutate)

    async def test_invalidate_search_cache_method(self, populated_store):
        """invalidate_search_cache reports and removes every entry."""
        # Two queries with different leading characters -> two entries.
        await populated_store.search("alpha query")
        await populated_store.search("beta query")
        assert populated_store.get_search_cache_stats()["entries"] == 2
        assert populated_store.invalidate_search_cache() == 2
        assert populated_store.get_search_cache_stats()["entries"] == 0

    async def test_search_cache_stats(self, populated_store):
        """get_search_cache_stats exposes all keys and echoes the config."""
        stats = populated_store.get_search_cache_stats()
        for key in (
            "enabled",
            "entries",
            "max_entries",
            "ttl_seconds",
            "similarity_threshold",
            "hits",
            "misses",
            "invalidations",
            "hit_rate",
        ):
            assert key in stats
        assert stats["enabled"] is True
        assert stats["max_entries"] == 100
        assert stats["ttl_seconds"] == 3600
        assert stats["similarity_threshold"] == 0.95

    async def test_search_cache_not_used_for_fuzzy(self, populated_store):
        """Fuzzy searches bypass the cache entirely."""
        await populated_store.search("calculate", use_fuzzy=True)
        assert populated_store.get_search_cache_stats()["entries"] == 0

    async def test_search_cache_not_used_for_path_pattern(self, populated_store):
        """Path-pattern searches bypass the cache entirely."""
        await populated_store.search("calculate", path_pattern="src/**/*.py")
        assert populated_store.get_search_cache_stats()["entries"] == 0
class TestSearchCacheDisabled:
    """Tests for search caching when disabled."""

    @pytest.fixture
    def disabled_config(self):
        """Cache config with caching switched off."""
        return SearchCacheConfig(enabled=False)

    @pytest.fixture
    def vector_store(self, tmp_path, disabled_config):
        """VectorStore constructed with the disabled cache config."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(
            tmp_path / "test.lance",
            MockEmbeddingProvider(),
            search_cache_config=disabled_config,
        )

    async def test_cache_disabled_no_caching(self, vector_store):
        """With caching off, repeated searches record no cache activity."""
        await vector_store.create_or_update_table(
            [make_chunk("func_1", content="def calculate(): pass")]
        )
        for _ in range(2):
            await vector_store.search("calculate")
        stats = vector_store.get_search_cache_stats()
        assert stats["enabled"] is False
        assert stats["entries"] == 0
        assert stats["hits"] == 0
        assert stats["misses"] == 0
class TestSearchCacheEviction:
    """Tests for search cache eviction."""

    @pytest.fixture
    def vector_store_with_small_cache(self, tmp_path):
        """VectorStore whose cache holds at most three entries.

        A SearchCache is built directly from a plain config object so that
        max_entries can sit below the >= 100 floor enforced by config
        validation.
        """
        from local_deepwiki.core.vectorstore import SearchCache, VectorStore

        class SmallCacheConfig:
            enabled = True
            ttl_seconds = 3600
            max_entries = 3  # Small for testing
            similarity_threshold = 0.95

        store = VectorStore(tmp_path / "test.lance", SemanticMockEmbeddingProvider())
        # Swap in the tiny cache, bypassing validation.
        store._search_cache = SearchCache(SmallCacheConfig())
        return store

    async def test_cache_eviction_when_over_capacity(self, vector_store_with_small_cache):
        """Exceeding max_entries evicts older cache entries."""
        store = vector_store_with_small_cache
        await store.create_or_update_table(
            [
                make_chunk("func_1", content="def alpha(): pass"),
                make_chunk("func_2", content="def beta(): pass"),
                make_chunk("func_3", content="def gamma(): pass"),
                make_chunk("func_4", content="def delta(): pass"),
                make_chunk("func_5", content="def epsilon(): pass"),
            ]
        )
        # Four distinct queries against a max of 3; the fourth forces
        # eviction down toward 80% occupancy (2.4 -> 2 entries).
        for query in ("alpha", "beta", "gamma", "delta"):
            await store.search(query)
        assert store.get_search_cache_stats()["entries"] <= 3
class TestSearchCacheTTL:
    """Tests for search cache TTL expiration."""

    @pytest.fixture
    def vector_store_with_short_ttl(self, tmp_path):
        """Create a vector store with short TTL cache.

        We directly create a SearchCache with short TTL to bypass
        the config validation that requires ttl_seconds >= 60.
        """
        from local_deepwiki.core.vectorstore import SearchCache, VectorStore

        db_path = tmp_path / "test.lance"
        provider = MockEmbeddingProvider()
        # Create VectorStore with default config first
        store = VectorStore(db_path, provider)

        # Replace the cache with a short TTL one for testing
        class ShortTTLConfig:
            enabled = True
            ttl_seconds = 1  # 1 second TTL for testing
            max_entries = 1000
            similarity_threshold = 0.95

        store._search_cache = SearchCache(ShortTTLConfig())
        return store

    async def test_cache_entry_expires_after_ttl(self, vector_store_with_short_ttl):
        """Test that cache entries expire after TTL."""
        vector_store = vector_store_with_short_ttl
        chunks = [make_chunk("func_1", content="def calculate(): pass")]
        await vector_store.create_or_update_table(chunks)
        # First search - cache miss
        await vector_store.search("calculate")
        stats1 = vector_store.get_search_cache_stats()
        assert stats1["entries"] == 1
        # Fix: wait with asyncio.sleep instead of time.sleep so the event
        # loop is not blocked inside an async test.
        await asyncio.sleep(1.5)
        # Second search - entry expired, should be cache miss
        await vector_store.search("calculate")
        stats2 = vector_store.get_search_cache_stats()
        # The expired entry should have been cleaned up
        assert stats2["misses"] == 2
class TestSearchCacheSemanticSimilarity:
    """Tests for semantic similarity matching in search cache."""

    @pytest.fixture
    def cache_config(self):
        """Cache config with a relaxed similarity threshold."""
        return SearchCacheConfig(
            enabled=True,
            ttl_seconds=3600,
            max_entries=100,
            similarity_threshold=0.9,  # Lower threshold for testing
        )

    @pytest.fixture
    def vector_store(self, tmp_path, cache_config):
        """Store whose mock provider emits one identical embedding for every text."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(
            tmp_path / "test.lance",
            MockEmbeddingProvider(),  # Identical embeddings = similarity 1.0
            search_cache_config=cache_config,
        )

    async def test_semantic_cache_hit_identical_embeddings(self, vector_store):
        """Different query text with an identical embedding hits the cache."""
        await vector_store.create_or_update_table(
            [make_chunk("func_1", content="def calculate(): pass")]
        )
        await vector_store.search("query1")
        first = vector_store.get_search_cache_stats()
        assert first["misses"] == 1
        assert first["hits"] == 0
        # The mock embeds every text identically, so "query2" is
        # semantically indistinguishable from "query1" and must hit.
        await vector_store.search("query2")
        assert vector_store.get_search_cache_stats()["hits"] == 1
class TestSearchCacheIntegration:
    """Integration tests for search cache with VectorStore."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Store built without an explicit cache config, so defaults apply."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(tmp_path / "test.lance", MockEmbeddingProvider())

    async def test_default_cache_config(self, vector_store):
        """The default SearchCacheConfig values surface in the stats."""
        stats = vector_store.get_search_cache_stats()
        assert stats["enabled"] is True
        assert stats["ttl_seconds"] == 3600  # Default 1 hour
        assert stats["max_entries"] == 1000  # Default
        assert stats["similarity_threshold"] == 0.95  # Default

    async def test_cache_survives_empty_search(self, vector_store):
        """Searching before any table exists returns [] and caches nothing."""
        assert await vector_store.search("calculate") == []
        # No table -> nothing to cache.
        assert vector_store.get_search_cache_stats()["entries"] == 0

    async def test_cache_with_limit_filter(self, vector_store):
        """Different limit values produce separate cache entries."""
        await vector_store.create_or_update_table(
            [
                make_chunk("func_1", content="def calculate1(): pass"),
                make_chunk("func_2", content="def calculate2(): pass"),
                make_chunk("func_3", content="def calculate3(): pass"),
            ]
        )
        await vector_store.search("calculate")
        assert vector_store.get_search_cache_stats()["entries"] == 1
        # Same text, different limit -> distinct cache key -> miss.
        await vector_store.search("calculate", limit=5)
        after = vector_store.get_search_cache_stats()
        assert after["entries"] == 2
        assert after["misses"] == 2
class TestSearchCacheClass:
    """Direct tests for the SearchCache class."""

    @staticmethod
    def _cache():
        """Build a SearchCache with default configuration."""
        from local_deepwiki.core.vectorstore import SearchCache

        return SearchCache(SearchCacheConfig())

    def test_compute_similarity_identical_vectors(self):
        """Test similarity computation for identical vectors."""
        cache = self._cache()
        v = [0.1, 0.2, 0.3, 0.4, 0.5]
        assert cache._compute_similarity(v, v) == pytest.approx(1.0)

    def test_compute_similarity_orthogonal_vectors(self):
        """Test similarity computation for orthogonal vectors."""
        cache = self._cache()
        similarity = cache._compute_similarity([1.0, 0.0, 0.0], [0.0, 1.0, 0.0])
        assert similarity == pytest.approx(0.0)

    def test_compute_similarity_opposite_vectors(self):
        """Test similarity computation for opposite vectors."""
        cache = self._cache()
        similarity = cache._compute_similarity([1.0, 1.0, 1.0], [-1.0, -1.0, -1.0])
        assert similarity == pytest.approx(-1.0)

    def test_compute_similarity_zero_vector(self):
        """Test similarity computation with zero vector."""
        # A zero vector has no direction; similarity is exactly 0.0.
        cache = self._cache()
        assert cache._compute_similarity([0.0, 0.0, 0.0], [1.0, 1.0, 1.0]) == 0.0

    def test_filters_match_identical(self):
        """Test filters matching with identical filters."""
        cache = self._cache()
        assert cache._filters_match(
            {"language": "python", "limit": 10},
            {"language": "python", "limit": 10},
        ) is True

    def test_filters_match_different(self):
        """Test filters matching with different filters."""
        cache = self._cache()
        assert cache._filters_match(
            {"language": "python", "limit": 10},
            {"language": "typescript", "limit": 10},
        ) is False

    def test_filters_match_empty(self):
        """Test filters matching with empty filters."""
        cache = self._cache()
        assert cache._filters_match({}, {}) is True
        assert cache._filters_match({"a": 1}, {}) is False

    def test_stats_returns_copy(self):
        """Test that stats returns a copy, not the internal dict."""
        cache = self._cache()
        snapshot = cache.stats
        snapshot["hits"] = 999  # mutate only the returned copy
        assert cache.stats["hits"] == 0  # internal counters untouched
class TestParallelEmbedding:
    """Tests for parallel embedding generation."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Create a vector store for testing."""
        from local_deepwiki.core.vectorstore import VectorStore

        db_path = tmp_path / "test.lance"
        provider = MockEmbeddingProvider()
        return VectorStore(db_path, provider)

    @pytest.fixture
    def slow_vector_store(self, tmp_path):
        """Create a vector store with slow embedding provider."""
        from local_deepwiki.core.vectorstore import VectorStore

        db_path = tmp_path / "test.lance"
        # 50 ms per embed() call makes parallel-vs-sequential timing measurable.
        provider = SlowMockEmbeddingProvider(delay_seconds=0.05)
        config = EmbeddingBatchConfig(batch_size=2, concurrency=4)
        return VectorStore(db_path, provider, embedding_batch_config=config)

    async def test_parallel_embedding_basic(self, vector_store):
        """Test basic parallel embedding generation."""
        texts = [f"text_{i}" for i in range(20)]
        embeddings = await vector_store._batch_embed(texts, batch_size=5)
        assert len(embeddings) == 20
        assert all(len(e) == 384 for e in embeddings)  # mock provider's dimension

    async def test_parallel_embedding_preserves_order(self, vector_store):
        """Test that parallel embedding preserves input order."""
        # Use distinctive texts so we can verify order
        texts = [f"unique_text_{i:04d}" for i in range(50)]
        embeddings = await vector_store._batch_embed(texts, batch_size=10)
        # All embeddings should be present
        assert len(embeddings) == 50
        # Embeddings should be in same order as inputs
        # (with mock provider, all embeddings are identical, but count should match)
        provider = vector_store.embedding_provider
        total_embedded = sum(len(call) for call in provider.embed_calls)
        assert total_embedded == 50

    async def test_parallel_embedding_faster_than_sequential(self, slow_vector_store):
        """Test that parallel embedding is faster than sequential."""
        texts = [f"text_{i}" for i in range(10)]  # 10 texts, 2 per batch = 5 batches
        # Time parallel execution
        start = time.time()
        await slow_vector_store._batch_embed(texts, batch_size=2)
        parallel_time = time.time() - start
        # Time sequential execution (for comparison)
        start = time.time()
        await slow_vector_store._batch_embed_sequential(texts, batch_size=2)
        sequential_time = time.time() - start
        # Parallel must beat sequential by at least 20% (the 0.8 factor);
        # the loose margin absorbs scheduler jitter in test environments.
        assert parallel_time < sequential_time * 0.8, (
            f"Parallel ({parallel_time:.3f}s) should be faster than "
            f"sequential ({sequential_time:.3f}s)"
        )

    async def test_parallel_embedding_concurrency_limited(self, tmp_path):
        """Test that concurrency is properly limited by semaphore."""
        from local_deepwiki.core.vectorstore import VectorStore

        # Use API provider name to avoid automatic concurrency boost for local
        provider = SlowMockEmbeddingProvider(delay_seconds=0.1, name="openai:slow-mock")
        config = EmbeddingBatchConfig(batch_size=1, concurrency=2)  # Only 2 concurrent
        store = VectorStore(tmp_path / "test.lance", provider, embedding_batch_config=config)
        texts = [f"text_{i}" for i in range(4)]  # 4 batches with concurrency 2
        start = time.time()
        await store._batch_embed(texts, batch_size=1)
        elapsed = time.time() - start
        # With 4 batches and concurrency 2, should take ~0.2s (2 rounds of 0.1s each)
        # With concurrency 4, would take ~0.1s
        # Allow some margin
        assert elapsed >= 0.15, f"Expected >= 0.15s with concurrency 2, got {elapsed:.3f}s"

    async def test_parallel_embedding_empty_list(self, vector_store):
        """Test parallel embedding with empty list."""
        embeddings = await vector_store._batch_embed([])
        assert embeddings == []

    async def test_parallel_embedding_single_text(self, vector_store):
        """Test parallel embedding with single text."""
        embeddings = await vector_store._batch_embed(["single text"])
        assert len(embeddings) == 1
        assert len(embeddings[0]) == 384

    async def test_parallel_embedding_single_batch(self, vector_store):
        """Test parallel embedding when all texts fit in single batch."""
        texts = ["text_1", "text_2", "text_3"]
        embeddings = await vector_store._batch_embed(texts, batch_size=100)
        assert len(embeddings) == 3

    async def test_parallel_embedding_with_progress_logging(self, vector_store):
        """Test parallel embedding with progress logging enabled."""
        texts = [f"text_{i}" for i in range(30)]
        # This should complete without error with logging enabled
        embeddings = await vector_store._batch_embed(texts, batch_size=10, log_progress=True)
        assert len(embeddings) == 30
class TestParallelEmbeddingRetry:
    """Tests for parallel embedding retry logic."""

    @pytest.fixture
    def failing_vector_store(self, tmp_path):
        """Create a vector store with failing provider."""
        from local_deepwiki.core.vectorstore import VectorStore

        # fail_count=2: a bounded number of failures, so retries can
        # eventually succeed (see FailingMockEmbeddingProvider).
        provider = FailingMockEmbeddingProvider(fail_count=2)
        config = EmbeddingBatchConfig(
            batch_size=5,
            concurrency=2,
            retry_max_attempts=3,
            retry_base_delay=0.1,
        )
        return VectorStore(tmp_path / "test.lance", provider, embedding_batch_config=config)

    async def test_retry_on_connection_error(self, failing_vector_store):
        """Test that connection errors trigger retry."""
        texts = [f"text_{i}" for i in range(5)]
        # Should succeed after retries
        embeddings = await failing_vector_store._batch_embed(texts, batch_size=5)
        assert len(embeddings) == 5
        # Provider should have been called multiple times due to retries
        provider = failing_vector_store.embedding_provider
        assert len(provider.embed_calls) >= 2

    async def test_retry_exhausted_raises_error(self, tmp_path):
        """Test that exhausted retries raise RuntimeError."""
        from local_deepwiki.core.vectorstore import VectorStore

        # Create provider that always fails (100 far exceeds the 2 attempts)
        provider = FailingMockEmbeddingProvider(fail_count=100)
        config = EmbeddingBatchConfig(
            batch_size=5,
            concurrency=1,
            retry_max_attempts=2,
            retry_base_delay=0.1,  # Must be >= 0.1
        )
        store = VectorStore(tmp_path / "test.lance", provider, embedding_batch_config=config)
        texts = [f"text_{i}" for i in range(5)]
        with pytest.raises(RuntimeError, match="Failed to embed"):
            await store._batch_embed(texts, batch_size=5)

    async def test_partial_failure_reports_errors(self, tmp_path):
        """Test that partial batch failures are properly reported."""
        from local_deepwiki.core.vectorstore import VectorStore

        # Provider that fails on specific batches.
        # NOTE(review): fail_on_batches keys on hash() of a batch's first
        # text; str hashes are process-seeded, but both sides compute the
        # hash within this same process, so the lookup is consistent —
        # confirm against FailingMockEmbeddingProvider's implementation.
        provider = FailingMockEmbeddingProvider(
            fail_count=100,  # Never succeeds
            fail_on_batches={hash("batch_2_text_0")},  # Fail on second batch
        )
        config = EmbeddingBatchConfig(
            batch_size=2,
            concurrency=2,
            retry_max_attempts=2,
            retry_base_delay=0.1,  # Must be >= 0.1
        )
        store = VectorStore(tmp_path / "test.lance", provider, embedding_batch_config=config)
        texts = ["batch_1_text_0", "batch_1_text_1", "batch_2_text_0", "batch_2_text_1"]
        with pytest.raises(RuntimeError, match="Failed to embed"):
            await store._batch_embed(texts, batch_size=2)
class TestParallelEmbeddingRateLimiting:
    """Tests for rate limiting in parallel embedding."""

    @pytest.fixture
    def rate_limited_store(self, tmp_path):
        """Create a vector store with rate limiting configured."""
        from local_deepwiki.core.vectorstore import VectorStore

        provider = MockEmbeddingProvider(name="openai:test")
        config = EmbeddingBatchConfig(
            batch_size=2,
            concurrency=4,
            rate_limit_rpm=120,  # 2 requests per second
        )
        return VectorStore(tmp_path / "test.lance", provider, embedding_batch_config=config)

    async def test_rate_limiter_throttles_requests(self, rate_limited_store):
        """Test that rate-limited embedding completes and returns all embeddings.

        We deliberately do not assert on elapsed wall-clock time: the token
        bucket may hold enough accumulated tokens for all four batches to pass
        immediately, so any timing assertion would be flaky. (The previous
        version asserted ``elapsed >= 0.0``, which is vacuously true.) Instead
        we assert on the results themselves.
        """
        texts = [f"text_{i}" for i in range(8)]  # 4 batches of 2
        embeddings = await rate_limited_store._batch_embed(texts, batch_size=2)
        # Every input must be embedded despite throttling.
        assert len(embeddings) == 8
        assert all(len(e) == 384 for e in embeddings)  # mock provider's dimension

    async def test_rate_limiter_handles_api_errors(self, tmp_path):
        """Test that rate limit API errors trigger retry."""
        from local_deepwiki.core.vectorstore import VectorStore

        provider = RateLimitMockEmbeddingProvider(rate_limit_after=2)
        config = EmbeddingBatchConfig(
            batch_size=2,
            concurrency=1,  # sequential, so the failing batch order is deterministic
            retry_max_attempts=3,
            retry_base_delay=0.1,
        )
        store = VectorStore(tmp_path / "test.lance", provider, embedding_batch_config=config)
        texts = [f"text_{i}" for i in range(6)]  # 3 batches
        # Should succeed because a rate-limit error is retryable.
        embeddings = await store._batch_embed(texts, batch_size=2)
        assert len(embeddings) == 6
class TestProviderTypeDetection:
    """Tests for provider type detection."""

    @staticmethod
    def _store(tmp_path, provider, config=None):
        """Build a VectorStore under tmp_path for the given provider."""
        from local_deepwiki.core.vectorstore import VectorStore

        if config is None:
            return VectorStore(tmp_path / "test.lance", provider)
        return VectorStore(tmp_path / "test.lance", provider, embedding_batch_config=config)

    def test_local_provider_detection(self, tmp_path):
        """Test detection of local provider."""
        store = self._store(tmp_path, MockEmbeddingProvider(name="local:all-MiniLM-L6-v2"))
        assert store._is_local_provider() is True

    def test_api_provider_detection(self, tmp_path):
        """Test detection of API provider."""
        store = self._store(tmp_path, MockEmbeddingProvider(name="openai:text-embedding-3-small"))
        assert store._is_local_provider() is False

    def test_optimal_config_for_local(self, tmp_path):
        """Test optimal config calculation for local provider."""
        store = self._store(
            tmp_path,
            MockEmbeddingProvider(name="local:test"),
            EmbeddingBatchConfig(batch_size=50, concurrency=2),
        )
        batch_size, concurrency = store._get_optimal_batch_config()
        # Local providers are boosted to larger batches and more workers.
        assert batch_size >= 100
        assert concurrency >= 4

    def test_optimal_config_for_api(self, tmp_path):
        """Test optimal config calculation for API provider."""
        store = self._store(
            tmp_path,
            MockEmbeddingProvider(name="openai:test"),
            EmbeddingBatchConfig(batch_size=200, concurrency=8),
        )
        batch_size, concurrency = store._get_optimal_batch_config()
        # API providers are capped to smaller batches and fewer workers.
        assert batch_size <= 50
        assert concurrency <= 4
class TestEmbeddingBatchConfig:
    """Tests for embedding batch configuration."""

    def test_get_embedding_batch_config(self, tmp_path):
        """Test getting embedding batch configuration."""
        from local_deepwiki.core.vectorstore import VectorStore

        store = VectorStore(
            tmp_path / "test.lance",
            MockEmbeddingProvider(name="local:test"),
            embedding_batch_config=EmbeddingBatchConfig(
                batch_size=100,
                concurrency=4,
                rate_limit_rpm=60,
                retry_max_attempts=5,
                retry_base_delay=2.0,
            ),
        )
        reported = store.get_embedding_batch_config()
        # Every configured value must round-trip through the report.
        assert reported["batch_size"] == 100
        assert reported["concurrency"] == 4
        assert reported["rate_limit_rpm"] == 60
        assert reported["retry_max_attempts"] == 5
        assert reported["retry_base_delay"] == 2.0
        assert reported["is_local_provider"] is True
        # The report also exposes derived optimal values.
        assert "optimal_batch_size" in reported
        assert "optimal_concurrency" in reported

    def test_default_config(self, tmp_path):
        """Test default embedding batch configuration."""
        from local_deepwiki.core.vectorstore import VectorStore

        store = VectorStore(tmp_path / "test.lance", MockEmbeddingProvider())
        reported = store.get_embedding_batch_config()
        # Defaults come straight from EmbeddingBatchConfig.
        assert reported["batch_size"] == 100
        assert reported["concurrency"] == 4
        assert reported["rate_limit_rpm"] is None
        assert reported["retry_max_attempts"] == 3
        assert reported["retry_base_delay"] == 1.0
class TestRateLimiter:
    """Tests for the RateLimiter class."""

    async def test_rate_limiter_basic(self):
        """Test basic rate limiter functionality."""
        from local_deepwiki.core.vectorstore import RateLimiter

        limiter = RateLimiter(requests_per_minute=600)  # 10 per second
        # First few requests should be fast (tokens available)
        start = time.time()
        for _ in range(5):
            await limiter.acquire()
        elapsed = time.time() - start
        # Should be nearly instant with tokens available
        assert elapsed < 1.0

    async def test_rate_limiter_throttles(self):
        """Test that rate limiter actually throttles."""
        from local_deepwiki.core.vectorstore import RateLimiter

        limiter = RateLimiter(requests_per_minute=60)  # 1 per second
        # Drain the initial tokens so acquire() is forced to wait for a refill.
        limiter.tokens = 0.0
        start = time.time()
        await limiter.acquire()
        elapsed = time.time() - start
        # Should have waited ~1 second to refill; 0.8 leaves timing slack.
        assert elapsed >= 0.8
class TestEmbeddingProgress:
    """Tests for EmbeddingProgress tracking."""

    def test_progress_update(self):
        """Test progress update functionality."""
        from local_deepwiki.core.vectorstore import EmbeddingProgress

        progress = EmbeddingProgress(total_texts=100, total_batches=10)
        progress.update(success=True)
        assert (progress.completed_batches, progress.failed_batches) == (1, 0)
        progress.update(success=False)
        assert (progress.completed_batches, progress.failed_batches) == (1, 1)

    def test_progress_estimated_remaining(self):
        """Test estimated remaining time calculation."""
        from local_deepwiki.core.vectorstore import EmbeddingProgress

        progress = EmbeddingProgress(total_texts=100, total_batches=10)
        # No throughput data yet -> no estimate.
        assert progress.estimated_remaining_seconds is None
        # Pretend 5 of 10 batches finished over the last 5 seconds.
        progress.completed_batches = 5
        progress.start_time = time.time() - 5.0
        eta = progress.estimated_remaining_seconds
        # At the same rate, the remaining half should take roughly 5 more seconds.
        assert eta is not None
        assert 4.0 <= eta <= 6.0

    def test_progress_elapsed_time(self):
        """Test elapsed time calculation."""
        from local_deepwiki.core.vectorstore import EmbeddingProgress

        progress = EmbeddingProgress(total_texts=100, total_batches=10)
        progress.start_time = time.time() - 2.5
        assert 2.4 <= progress.elapsed_seconds <= 2.6
class TestBatchEmbeddingResult:
    """Tests for BatchEmbeddingResult dataclass."""

    def test_successful_result(self):
        """Test successful batch result."""
        from local_deepwiki.core.vectorstore import BatchEmbeddingResult

        ok = BatchEmbeddingResult(batch_index=0, embeddings=[[0.1, 0.2], [0.3, 0.4]])
        assert ok.batch_index == 0
        assert ok.embeddings == [[0.1, 0.2], [0.3, 0.4]]
        # Defaults for a success: no error, no retries.
        assert ok.error is None
        assert ok.retry_count == 0

    def test_failed_result(self):
        """Test failed batch result."""
        from local_deepwiki.core.vectorstore import BatchEmbeddingResult

        cause = ConnectionError("Test error")
        failed = BatchEmbeddingResult(
            batch_index=1,
            embeddings=None,
            error=cause,
            retry_count=3,
        )
        assert failed.batch_index == 1
        assert failed.embeddings is None
        assert failed.error is cause  # identity, not just equality
        assert failed.retry_count == 3
class TestParallelEmbeddingIntegration:
    """Integration tests for parallel embedding with full VectorStore operations."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Create a vector store for integration testing."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(
            tmp_path / "test.lance",
            MockEmbeddingProvider(name="local:test"),
            embedding_batch_config=EmbeddingBatchConfig(batch_size=10, concurrency=4),
        )

    async def test_create_or_update_with_parallel_embedding(self, vector_store):
        """Test that create_or_update_table uses parallel embedding."""
        indexed = await vector_store.create_or_update_table(
            [make_chunk(f"chunk_{i}") for i in range(50)]
        )
        assert indexed == 50
        assert vector_store.get_stats()["total_chunks"] == 50

    async def test_add_chunks_with_parallel_embedding(self, vector_store):
        """Test that add_chunks uses parallel embedding."""
        # Seed the table, then append a larger batch.
        await vector_store.create_or_update_table(
            [make_chunk(f"initial_{i}") for i in range(10)]
        )
        added = await vector_store.add_chunks([make_chunk(f"new_{i}") for i in range(40)])
        assert added == 40
        assert vector_store.get_stats()["total_chunks"] == 50

    async def test_search_after_parallel_indexing(self, vector_store):
        """Test search works correctly after parallel indexing."""
        await vector_store.create_or_update_table(
            [make_chunk(f"func_{i}", content=f"def function_{i}(): pass") for i in range(30)]
        )
        hits = await vector_store.search("function", limit=5)
        assert len(hits) > 0
        assert all(hit.chunk is not None for hit in hits)
class TestLazyIndexManager:
    """Tests for LazyIndexManager and lazy vector index creation."""

    @pytest.fixture
    def vector_store_lazy(self, tmp_path):
        """Create a vector store with lazy indexing enabled."""
        from local_deepwiki.config import LazyIndexConfig
        from local_deepwiki.core.vectorstore import VectorStore

        db_path = tmp_path / "test.lance"
        provider = MockEmbeddingProvider()
        # Enable lazy indexing with low thresholds for testing
        lazy_config = LazyIndexConfig(
            enabled=True,
            latency_threshold_ms=100,
            min_rows=100,  # Must be >= 100 per config validation
            latency_window_size=3,
        )
        return VectorStore(db_path, provider, lazy_index_config=lazy_config)

    @pytest.fixture
    def vector_store_eager(self, tmp_path):
        """Create a vector store with lazy indexing disabled (eager mode)."""
        from local_deepwiki.config import LazyIndexConfig
        from local_deepwiki.core.vectorstore import VectorStore

        db_path = tmp_path / "test.lance"
        provider = MockEmbeddingProvider()
        lazy_config = LazyIndexConfig(enabled=False, min_rows=100)
        return VectorStore(db_path, provider, lazy_index_config=lazy_config)

    async def test_lazy_index_manager_initialized(self, vector_store_lazy):
        """Test that lazy index manager is properly initialized."""
        assert vector_store_lazy._lazy_index_manager is not None
        assert vector_store_lazy._lazy_index_manager.config.enabled is True

    async def test_lazy_index_pending_after_create(self, vector_store_lazy):
        """Test that lazy indexing marks index as pending for large tables."""
        # Create enough chunks to trigger index threshold (min_rows=100)
        chunks = [make_chunk(f"chunk_{i}") for i in range(150)]
        await vector_store_lazy.create_or_update_table(chunks)
        # Index should be pending, not created
        assert vector_store_lazy._lazy_index_manager.is_index_pending()
        assert not vector_store_lazy._lazy_index_manager.is_index_ready()

    async def test_lazy_index_not_pending_for_small_tables(self, vector_store_lazy):
        """Test that lazy indexing doesn't mark pending for small tables."""
        # Create fewer chunks than min_rows threshold
        chunks = [make_chunk(f"chunk_{i}") for i in range(50)]
        await vector_store_lazy.create_or_update_table(chunks)
        # Index should not be pending (too few rows)
        assert not vector_store_lazy._lazy_index_manager.is_index_pending()
        assert not vector_store_lazy._lazy_index_manager.is_index_ready()

    async def test_eager_index_created_immediately(self, vector_store_eager):
        """Test that eager indexing attempts to create index immediately for large tables."""
        # (Removed an unused `from unittest.mock import patch` import and the
        # stale comments describing a patch that was never applied.)
        # Create enough chunks to trigger index threshold
        chunks = [make_chunk(f"chunk_{i}") for i in range(150)]
        await vector_store_eager.create_or_update_table(chunks)
        # In eager mode with enough rows, index creation is attempted during
        # create_or_update_table itself (it may still fail under test-environment
        # constraints). The key property is that nothing is deferred:
        assert not vector_store_eager._lazy_index_manager.config.enabled
        # Index should either be created or we attempted it (not pending in lazy mode)
        assert not vector_store_eager._lazy_index_manager.is_index_pending()

    async def test_lazy_index_stats(self, vector_store_lazy):
        """Test get_lazy_index_stats returns correct information."""
        # Fresh store: config values reported, no activity recorded yet.
        stats = vector_store_lazy.get_lazy_index_stats()
        assert stats["enabled"] is True
        assert stats["index_pending"] is False
        assert stats["index_created"] is False
        assert stats["creation_in_progress"] is False
        assert stats["latency_threshold_ms"] == 100
        assert stats["min_rows"] == 100
        assert stats["average_latency_ms"] is None
        assert stats["latency_samples"] == 0

    async def test_lazy_index_latency_tracking(self, vector_store_lazy):
        """Test that search latency is tracked for lazy index decisions."""
        # Use fewer chunks (below threshold) so index isn't pending
        chunks = [make_chunk(f"chunk_{i}") for i in range(50)]
        await vector_store_lazy.create_or_update_table(chunks)
        # Perform some searches - use use_fuzzy=True to bypass cache
        # This ensures we always go through the full search path including latency tracking
        await vector_store_lazy.search("unique query alpha", use_fuzzy=True)
        await vector_store_lazy.search("unique query beta", use_fuzzy=True)
        await vector_store_lazy.search("unique query gamma", use_fuzzy=True)
        # Check latency was recorded
        stats = vector_store_lazy.get_lazy_index_stats()
        assert stats["latency_samples"] == 3
        assert stats["average_latency_ms"] is not None
        assert stats["average_latency_ms"] >= 0

    async def test_create_index_now(self, vector_store_lazy):
        """Test force immediate index creation."""
        # Create enough chunks
        chunks = [make_chunk(f"chunk_{i}") for i in range(150)]
        await vector_store_lazy.create_or_update_table(chunks)
        # Should be pending
        assert vector_store_lazy._lazy_index_manager.is_index_pending()
        # Force index creation - this may fail due to LanceDB internal reasons in tests
        # but we can verify the method is callable and updates state
        try:
            await vector_store_lazy.create_vector_index_now()
            # If successful, index should be ready
            assert vector_store_lazy.is_vector_index_ready()
        except (ValueError, RuntimeError):
            # LanceDB may complain about index already existing or other issues
            # The important thing is the method exists and handles errors gracefully
            pass

    async def test_is_vector_index_ready(self, vector_store_lazy):
        """Test is_vector_index_ready method."""
        assert vector_store_lazy.is_vector_index_ready() is False
        chunks = [make_chunk(f"chunk_{i}") for i in range(150)]
        await vector_store_lazy.create_or_update_table(chunks)
        # Still not ready (lazy mode)
        assert vector_store_lazy.is_vector_index_ready() is False

    async def test_on_vector_index_ready_callback(self, vector_store_lazy):
        """Test callback registration for index ready event."""
        callback_called = []

        def my_callback():
            callback_called.append(True)

        # Register callback
        vector_store_lazy.on_vector_index_ready(my_callback)
        # Callback shouldn't be called yet
        assert len(callback_called) == 0
        # Manually mark index as created to trigger callback
        vector_store_lazy._lazy_index_manager.mark_index_created()
        # Callback should have been called
        assert len(callback_called) == 1

    async def test_lazy_index_manager_reset(self, vector_store_lazy):
        """Test that reset clears all state."""
        chunks = [make_chunk(f"chunk_{i}") for i in range(150)]
        await vector_store_lazy.create_or_update_table(chunks)
        # Record some latency
        vector_store_lazy._lazy_index_manager.record_search_latency(100.0)
        # Now recreate the table (which calls reset internally)
        await vector_store_lazy.create_or_update_table(chunks)
        # State should be fresh (only pending flag set for large table)
        stats = vector_store_lazy.get_lazy_index_stats()
        assert stats["index_pending"] is True  # Set during create
        assert stats["index_created"] is False
        assert stats["latency_samples"] == 0  # Reset clears latency
class TestLazyIndexLatencyTrigger:
    """Tests for on-demand index creation triggered by latency."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Create a vector store with low latency threshold for testing."""
        from local_deepwiki.config import LazyIndexConfig
        from local_deepwiki.core.vectorstore import VectorStore

        lazy_config = LazyIndexConfig(
            enabled=True,
            latency_threshold_ms=50,  # minimum the config validation allows
            min_rows=100,  # config validation requires >= 100
            latency_window_size=3,
        )
        return VectorStore(tmp_path / "test.lance", MockEmbeddingProvider(), lazy_index_config=lazy_config)

    async def test_should_create_index_based_on_latency(self, vector_store):
        """Test that should_create_index returns True when latency exceeds threshold."""
        await vector_store.create_or_update_table(
            [make_chunk(f"chunk_{i}") for i in range(150)]
        )
        manager = vector_store._lazy_index_manager
        # The table is large enough, so the index is deferred (pending)...
        assert manager.is_index_pending()
        # ...and a pending index means creation should proceed.
        assert manager.should_create_index()

    async def test_should_not_create_when_disabled(self, tmp_path):
        """Test that should_create_index returns False when disabled."""
        from local_deepwiki.config import LazyIndexConfig
        from local_deepwiki.core.vectorstore import VectorStore

        store = VectorStore(
            tmp_path / "disabled.lance",
            MockEmbeddingProvider(),
            lazy_index_config=LazyIndexConfig(enabled=False),
        )
        await store.create_or_update_table([make_chunk(f"chunk_{i}") for i in range(150)])
        # Plenty of rows, but lazy indexing is switched off.
        assert not store._lazy_index_manager.should_create_index()

    async def test_should_not_create_when_already_created(self, vector_store):
        """Test that should_create_index returns False after creation."""
        await vector_store.create_or_update_table(
            [make_chunk(f"chunk_{i}") for i in range(150)]
        )
        manager = vector_store._lazy_index_manager
        manager.mark_index_created()
        # Once marked created, no further creation is requested.
        assert not manager.should_create_index()
class TestLatencyStats:
    """Tests for LatencyStats helper class."""

    def test_record_and_get_average(self):
        """Test recording latencies and computing average."""
        from local_deepwiki.core.vectorstore import LatencyStats

        stats = LatencyStats(window_size=5)
        for sample in (100.0, 200.0, 300.0):
            stats.record(sample)
        assert stats.get_count() == 3
        assert stats.get_average() == 200.0

    def test_window_size_limit(self):
        """Test that window size is respected."""
        from local_deepwiki.core.vectorstore import LatencyStats

        stats = LatencyStats(window_size=3)
        for i in range(10):
            stats.record(float(i * 100))
        # Only the newest 3 samples survive: 700, 800, 900.
        assert stats.get_count() == 3
        assert stats.get_average() == 800.0

    def test_empty_stats(self):
        """Test empty stats return None for average."""
        from local_deepwiki.core.vectorstore import LatencyStats

        stats = LatencyStats()
        assert stats.get_count() == 0
        assert stats.get_average() is None

    def test_clear(self):
        """Test clearing stats."""
        from local_deepwiki.core.vectorstore import LatencyStats

        stats = LatencyStats()
        stats.record(100.0)
        stats.record(200.0)
        stats.clear()
        # Cleared stats behave like a fresh instance.
        assert stats.get_count() == 0
        assert stats.get_average() is None
class TestLazyIndexScheduling:
    """Tests for background index creation scheduling."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Create a vector store for testing."""
        from local_deepwiki.config import LazyIndexConfig
        from local_deepwiki.core.vectorstore import VectorStore

        db_path = tmp_path / "test.lance"
        provider = MockEmbeddingProvider()
        lazy_config = LazyIndexConfig(
            enabled=True,
            min_rows=100,  # Must be >= 100 per config validation
            latency_threshold_ms=100,
        )
        return VectorStore(db_path, provider, lazy_index_config=lazy_config)

    async def test_schedule_index_creation(self, vector_store):
        """Test scheduling index creation as background task."""
        chunks = [make_chunk(f"chunk_{i}") for i in range(150)]
        await vector_store.create_or_update_table(chunks)
        manager = vector_store._lazy_index_manager
        # Should be pending
        assert manager.is_index_pending()
        # Schedule creation
        await vector_store.schedule_lazy_index_creation()
        # The background task may or may not have finished by now, so accept
        # either state: creation in progress or already complete.
        assert manager.is_creation_in_progress() or manager.is_index_ready()

    async def test_wait_for_index_timeout(self, vector_store):
        """Test wait_for_index with timeout."""
        chunks = [make_chunk(f"chunk_{i}") for i in range(150)]
        await vector_store.create_or_update_table(chunks)
        # Don't create the index, just wait with a very short timeout
        result = await vector_store.wait_for_vector_index(timeout=0.01)
        # Should return False (timed out)
        assert result is False

    async def test_wait_for_index_immediate_ready(self, vector_store):
        """Test wait_for_index returns immediately when index is ready."""
        chunks = [make_chunk(f"chunk_{i}") for i in range(150)]
        await vector_store.create_or_update_table(chunks)
        # Mark as ready
        vector_store._lazy_index_manager.mark_index_created()
        # Should return immediately
        result = await vector_store.wait_for_vector_index(timeout=0.1)
        assert result is True

    async def test_duplicate_schedule_is_noop(self, vector_store):
        """Test that scheduling twice doesn't create duplicate tasks."""
        chunks = [make_chunk(f"chunk_{i}") for i in range(150)]
        await vector_store.create_or_update_table(chunks)
        # Schedule twice; the second call must be a no-op, not an error.
        await vector_store.schedule_lazy_index_creation()
        await vector_store.schedule_lazy_index_creation()
        # Should work without errors
        manager = vector_store._lazy_index_manager
        assert manager.is_creation_in_progress() or manager.is_index_ready()
class TestLazyIndexIntegration:
    """Integration tests for lazy index with full workflow."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Build a lazily-indexed VectorStore for integration tests."""
        from local_deepwiki.config import LazyIndexConfig
        from local_deepwiki.core.vectorstore import VectorStore

        config = LazyIndexConfig(
            enabled=True,
            min_rows=100,  # config validation requires >= 100
            latency_threshold_ms=500,
        )
        return VectorStore(
            tmp_path / "integration.lance",
            MockEmbeddingProvider(),
            lazy_index_config=config,
        )

    async def test_search_works_without_index(self, vector_store):
        """Brute-force search succeeds while the vector index is pending."""
        data = [
            make_chunk(f"chunk_{i}", content=f"content number {i}")
            for i in range(150)
        ]
        await vector_store.create_or_update_table(data)
        # In lazy mode the index has not been built yet.
        assert vector_store._lazy_index_manager.is_index_pending()
        assert not vector_store.is_vector_index_ready()
        hits = await vector_store.search("content", limit=5)
        assert len(hits) > 0
        assert all(hit.chunk is not None for hit in hits)

    async def test_full_workflow_with_lazy_index(self, vector_store):
        """End-to-end: ingest, search, build index, search again."""
        data = [
            make_chunk(f"func_{i}", content=f"def function_{i}(): pass")
            for i in range(150)
        ]
        await vector_store.create_or_update_table(data)
        # Search before any index exists.
        first = await vector_store.search("function", limit=5)
        assert len(first) > 0
        # Stats should reflect the pending index and the single search.
        stats = vector_store.get_lazy_index_stats()
        assert stats["index_pending"] is True
        assert stats["latency_samples"] == 1
        # Index creation may not be possible in the test environment.
        try:
            await vector_store.create_vector_index_now()
        except (ValueError, RuntimeError):
            pass
        # Searching must keep working either way.
        second = await vector_store.search("function", limit=5)
        assert len(second) > 0

    async def test_callback_invoked_on_index_ready(self, vector_store):
        """Registered callbacks fire when the index becomes ready."""
        await vector_store.create_or_update_table(
            [make_chunk(f"chunk_{i}") for i in range(150)]
        )
        state = {"called": False, "call_count": 0}

        def on_ready():
            state["called"] = True
            state["call_count"] += 1

        vector_store.on_vector_index_ready(on_ready)
        assert not state["called"]  # nothing has fired yet
        vector_store._lazy_index_manager.mark_index_created()
        assert state["called"]
        assert state["call_count"] == 1

    async def test_callback_immediate_if_already_ready(self, vector_store):
        """Registering after readiness invokes the callback at once."""
        await vector_store.create_or_update_table(
            [make_chunk(f"chunk_{i}") for i in range(150)]
        )
        vector_store._lazy_index_manager.mark_index_created()
        state = {"called": False}

        def on_ready():
            state["called"] = True

        vector_store.on_vector_index_ready(on_ready)
        assert state["called"]
class TestSearchProfiles:
    """Tests for configurable search profiles (precision/recall trade-off)."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Create a vector store for testing."""
        # Import only what this fixture uses (SearchProfile was previously
        # imported here but never referenced).
        from local_deepwiki.core.vectorstore import VectorStore

        db_path = tmp_path / "test.lance"
        provider = MockEmbeddingProvider()
        return VectorStore(db_path, provider)

    @pytest.fixture
    async def populated_store(self, vector_store):
        """Create a vector store with test data."""
        # Create chunks with varying content to test similarity filtering
        chunks = [
            make_chunk("chunk_1", "src/auth.py", "def authenticate_user(): pass"),
            make_chunk("chunk_2", "src/auth.py", "def validate_token(): pass"),
            make_chunk("chunk_3", "src/db.py", "def connect_database(): pass"),
            make_chunk("chunk_4", "src/api.py", "def handle_request(): pass"),
            make_chunk("chunk_5", "tests/test_auth.py", "def test_auth(): pass"),
        ]
        await vector_store.create_or_update_table(chunks)
        return vector_store

    async def test_search_with_fast_profile(self, populated_store):
        """Test searching with FAST profile."""
        from local_deepwiki.core.vectorstore import SearchProfile

        results = await populated_store.search(
            "authenticate", limit=5, profile=SearchProfile.FAST
        )
        # FAST profile has higher min_similarity threshold (0.3).
        # With mock embeddings, all results have the same similarity.
        assert isinstance(results, list)
        # Results should be returned successfully
        for r in results:
            assert r.chunk is not None

    async def test_search_with_balanced_profile(self, populated_store):
        """Test searching with BALANCED profile (default)."""
        from local_deepwiki.core.vectorstore import SearchProfile

        results = await populated_store.search(
            "authenticate", limit=5, profile=SearchProfile.BALANCED
        )
        assert isinstance(results, list)
        for r in results:
            assert r.chunk is not None

    async def test_search_with_thorough_profile(self, populated_store):
        """Test searching with THOROUGH profile."""
        from local_deepwiki.core.vectorstore import SearchProfile

        results = await populated_store.search(
            "authenticate", limit=5, profile=SearchProfile.THOROUGH
        )
        # THOROUGH profile has lower min_similarity threshold (0.1),
        # so it should admit more results.
        assert isinstance(results, list)
        for r in results:
            assert r.chunk is not None

    async def test_search_with_string_profile(self, populated_store):
        """Test searching with profile as string."""
        # Profiles may also be passed by name.
        results_fast = await populated_store.search("test", limit=5, profile="fast")
        results_balanced = await populated_store.search("test", limit=5, profile="balanced")
        results_thorough = await populated_store.search("test", limit=5, profile="thorough")
        assert isinstance(results_fast, list)
        assert isinstance(results_balanced, list)
        assert isinstance(results_thorough, list)

    async def test_search_with_invalid_profile_string(self, populated_store):
        """Test searching with invalid profile string falls back to default."""
        # Invalid profile should fall back to default without raising
        results = await populated_store.search("test", limit=5, profile="invalid_profile")
        assert isinstance(results, list)

    async def test_search_with_min_similarity_override(self, populated_store):
        """Test that min_similarity parameter overrides profile default."""
        from local_deepwiki.core.vectorstore import SearchProfile

        # Use FAST profile (default min_similarity=0.3) but override to 0.01.
        # This should allow more results through.
        results = await populated_store.search(
            "test",
            limit=10,
            profile=SearchProfile.FAST,
            min_similarity=0.01,
        )
        assert isinstance(results, list)
        # With a very low threshold, all chunks may pass, capped by limit.
        assert len(results) <= 10

    async def test_search_high_min_similarity_filters_results(self, tmp_path):
        """Test that high min_similarity threshold filters out low-scoring results."""
        from local_deepwiki.core.vectorstore import VectorStore

        # Use a semantic mock that returns content-dependent embeddings.
        provider = SemanticMockEmbeddingProvider()
        store = VectorStore(tmp_path / "test.lance", provider)
        chunks = [
            make_chunk("chunk_1", content="authentication login"),
            make_chunk("chunk_2", content="completely unrelated content xyz"),
        ]
        await store.create_or_update_table(chunks)
        # With a very high threshold, low-scoring results should be dropped.
        results = await store.search(
            "authentication", limit=10, min_similarity=0.99
        )
        # High threshold may filter out all results depending on embeddings.
        assert isinstance(results, list)

    async def test_default_profile_configuration(self, tmp_path):
        """Test that default profile can be configured at construction."""
        from local_deepwiki.core.vectorstore import SearchProfile, VectorStore

        provider = MockEmbeddingProvider()
        # Create store with FAST as default
        store = VectorStore(
            tmp_path / "test.lance",
            provider,
            default_search_profile=SearchProfile.FAST,
        )
        assert store.get_search_profile() == SearchProfile.FAST
        # Create store with THOROUGH as default
        store2 = VectorStore(
            tmp_path / "test2.lance",
            provider,
            default_search_profile=SearchProfile.THOROUGH,
        )
        assert store2.get_search_profile() == SearchProfile.THOROUGH

    async def test_set_search_profile(self, vector_store):
        """Test setting search profile at runtime."""
        from local_deepwiki.core.vectorstore import SearchProfile

        # Default should be BALANCED
        assert vector_store.get_search_profile() == SearchProfile.BALANCED
        # Set to FAST
        vector_store.set_search_profile(SearchProfile.FAST)
        assert vector_store.get_search_profile() == SearchProfile.FAST
        # Set using string
        vector_store.set_search_profile("thorough")
        assert vector_store.get_search_profile() == SearchProfile.THOROUGH

    async def test_set_search_profile_invalid_string(self, vector_store):
        """Test setting invalid profile string raises ValueError."""
        with pytest.raises(ValueError, match="Invalid search profile"):
            vector_store.set_search_profile("invalid")
class TestAdaptiveSearch:
    """Tests for adaptive search depth estimation."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Build a plain VectorStore backed by a mock embedder."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(tmp_path / "test.lance", MockEmbeddingProvider())

    @pytest.fixture
    async def populated_store(self, vector_store):
        """Fill the store with 20 small chunks."""
        await vector_store.create_or_update_table(
            [make_chunk(f"chunk_{i}", content=f"test content {i}") for i in range(20)]
        )
        return vector_store

    async def test_adaptive_search_enabled_by_default(self, vector_store):
        """Adaptive search should be on without any configuration."""
        assert vector_store.get_adaptive_search_enabled() is True

    async def test_disable_adaptive_search(self, vector_store):
        """Adaptive search can be switched off at runtime."""
        vector_store.set_adaptive_search_enabled(False)
        assert vector_store.get_adaptive_search_enabled() is False

    async def test_adaptive_search_estimates_depth(self, populated_store):
        """The searcher widens its depth estimate for complex queries."""
        searcher = populated_store._adaptive_searcher
        shallow = searcher.estimate_optimal_depth("test", base_limit=10)
        assert shallow >= 10
        deep = searcher.estimate_optimal_depth(
            "authentication middleware handler controller service",
            base_limit=10,
        )
        # A multi-term technical query should never estimate less depth.
        assert deep >= shallow

    async def test_adaptive_search_records_quality(self, populated_store):
        """Each search feeds the quality history used for adaptation."""
        await populated_store.search("test content")
        snapshot = populated_store.get_adaptive_search_stats()
        assert snapshot["query_history_size"] >= 1

    async def test_adaptive_search_disabled_does_not_record(self, populated_store):
        """Searches still succeed with adaptation off, and stats say so."""
        populated_store.set_adaptive_search_enabled(False)
        await populated_store.search("test content")
        snapshot = populated_store.get_adaptive_search_stats()
        assert "adaptive_search_enabled" in snapshot
        assert snapshot["adaptive_search_enabled"] is False
class TestSearchFeedback:
    """Tests for search feedback system."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Build a plain VectorStore backed by a mock embedder."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(tmp_path / "test.lance", MockEmbeddingProvider())

    @pytest.fixture
    async def populated_store(self, vector_store):
        """Fill the store with 10 small chunks."""
        await vector_store.create_or_update_table(
            [make_chunk(f"chunk_{i}", content=f"test content {i}") for i in range(10)]
        )
        return vector_store

    async def test_record_feedback(self, populated_store):
        """A single relevance vote shows up in the feedback stats."""
        from local_deepwiki.core.vectorstore import SearchFeedback

        hits = await populated_store.search("test")
        assert len(hits) > 0
        populated_store.record_feedback(
            SearchFeedback(
                query="test",
                result_id=hits[0].chunk.id,
                relevant=True,
            )
        )
        snapshot = populated_store.get_adaptive_search_stats()
        assert snapshot["feedback_stats"]["total_feedback"] == 1
        assert snapshot["feedback_stats"]["relevant_count"] == 1

    async def test_record_multiple_feedback(self, populated_store):
        """Aggregate counts and relevance rate reflect every vote."""
        from local_deepwiki.core.vectorstore import SearchFeedback

        await populated_store.search("test")
        votes = [("chunk_0", True), ("chunk_1", False), ("chunk_2", True)]
        for result_id, relevant in votes:
            populated_store.record_feedback(
                SearchFeedback(query="test", result_id=result_id, relevant=relevant)
            )
        feedback = populated_store.get_adaptive_search_stats()["feedback_stats"]
        assert feedback["total_feedback"] == 3
        assert feedback["relevant_count"] == 2
        assert feedback["irrelevant_count"] == 1
        assert feedback["relevance_rate"] == pytest.approx(2 / 3)

    async def test_feedback_stats_empty(self, vector_store):
        """Without any votes the stats report zeros."""
        feedback = vector_store.get_adaptive_search_stats()["feedback_stats"]
        assert feedback["total_feedback"] == 0
        assert feedback["relevance_rate"] == 0.0
class TestSearchProfilesWithPagination:
    """Tests for search profiles with pagination."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Build a plain VectorStore backed by a mock embedder."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(tmp_path / "test.lance", MockEmbeddingProvider())

    @pytest.fixture
    async def populated_store(self, vector_store):
        """Fill the store with 50 small chunks."""
        await vector_store.create_or_update_table(
            [make_chunk(f"chunk_{i}", content=f"test content {i}") for i in range(50)]
        )
        return vector_store

    async def test_paginated_search_with_fast_profile(self, populated_store):
        """FAST-profile pagination honours limit and offset."""
        from local_deepwiki.core.vectorstore import SearchProfile

        page = await populated_store.search_paginated(
            "test", limit=10, offset=0, profile=SearchProfile.FAST
        )
        assert page.limit == 10
        assert page.offset == 0
        assert len(page.results) <= 10

    async def test_paginated_search_with_thorough_profile(self, populated_store):
        """THOROUGH-profile pagination still caps the page size."""
        from local_deepwiki.core.vectorstore import SearchProfile

        page = await populated_store.search_paginated(
            "test", limit=10, offset=0, profile=SearchProfile.THOROUGH
        )
        assert len(page.results) <= 10
        # THOROUGH scans more candidates; total stays non-negative.
        assert page.total >= 0

    async def test_paginated_search_min_similarity_override(self, populated_store):
        """A strict similarity floor may empty the page but not break it."""
        page = await populated_store.search_paginated(
            "test",
            limit=10,
            offset=0,
            min_similarity=0.99,
        )
        assert isinstance(page.results, list)

    async def test_paginated_search_profile_string(self, populated_store):
        """Profiles can be passed by name for paginated search."""
        page = await populated_store.search_paginated(
            "test", limit=10, profile="balanced"
        )
        assert isinstance(page.results, list)
class TestAdaptiveSearcherUnit:
    """Unit tests for the AdaptiveSearcher class."""

    def test_query_complexity_empty(self):
        """An empty query has zero complexity."""
        from local_deepwiki.core.vectorstore import AdaptiveSearcher

        assert AdaptiveSearcher()._calculate_query_complexity("") == 0.0

    def test_query_complexity_simple(self):
        """A one-word query scores within the [0, 1] range."""
        from local_deepwiki.core.vectorstore import AdaptiveSearcher

        score = AdaptiveSearcher()._calculate_query_complexity("test")
        assert 0.0 <= score <= 1.0

    def test_query_complexity_technical(self):
        """Technical vocabulary pushes complexity higher."""
        from local_deepwiki.core.vectorstore import AdaptiveSearcher

        searcher = AdaptiveSearcher()
        technical = searcher._calculate_query_complexity(
            "function authentication middleware"
        )
        plain = searcher._calculate_query_complexity(
            "hello world foo"
        )
        # The technical query should outscore the non-technical one.
        assert technical > plain

    def test_query_complexity_caching(self):
        """Repeated complexity lookups come from the cache."""
        from local_deepwiki.core.vectorstore import AdaptiveSearcher

        searcher = AdaptiveSearcher()
        query = "test query"
        first = searcher._calculate_query_complexity(query)
        # The value must now be memoized under the query string.
        assert query in searcher._complexity_cache
        assert searcher._complexity_cache[query] == first
        # A second call returns the cached value.
        assert searcher._calculate_query_complexity(query) == first

    def test_estimate_optimal_depth_minimum(self):
        """Depth estimates never fall below the requested base limit."""
        from local_deepwiki.core.vectorstore import AdaptiveSearcher

        base_limit = 10
        estimate = AdaptiveSearcher().estimate_optimal_depth("test", base_limit=base_limit)
        assert estimate >= base_limit

    def test_estimate_optimal_depth_maximum(self):
        """Depth estimates are capped at ten times the base limit."""
        from local_deepwiki.core.vectorstore import AdaptiveSearcher

        base_limit = 10
        estimate = AdaptiveSearcher().estimate_optimal_depth(
            "very complex authentication middleware handler controller",
            base_limit=base_limit,
        )
        assert estimate <= base_limit * 10

    def test_record_search_quality_clamps_values(self):
        """Out-of-range quality scores are clamped into [0, 1]."""
        from local_deepwiki.core.vectorstore import AdaptiveSearcher

        searcher = AdaptiveSearcher()
        searcher.record_search_quality("test", quality=1.5, result_count=5, depth_used=20)
        searcher.record_search_quality("test2", quality=-0.5, result_count=5, depth_used=20)
        assert len(searcher._query_history) == 2
        # 1.5 clamps down to 1.0, -0.5 clamps up to 0.0.
        assert searcher._query_history[0][1] == 1.0
        assert searcher._query_history[1][1] == 0.0
class TestSearchProfileConfig:
    """Tests for search profile configuration."""

    def test_profile_config_values(self):
        """Test that profile configs have expected values."""
        # SearchProfileConfig itself was imported here but never referenced;
        # import only the names this test uses.
        from local_deepwiki.core.vectorstore import (
            SEARCH_PROFILES,
            SearchProfile,
        )

        # FAST profile should have lower fetch multiplier
        fast_config = SEARCH_PROFILES[SearchProfile.FAST]
        assert fast_config.fetch_multiplier == 1.0
        assert fast_config.rerank_candidates == 10
        assert fast_config.use_approximate is True
        assert fast_config.min_similarity == 0.3
        # BALANCED profile
        balanced_config = SEARCH_PROFILES[SearchProfile.BALANCED]
        assert balanced_config.fetch_multiplier == 2.0
        assert balanced_config.rerank_candidates == 50
        assert balanced_config.use_approximate is True
        assert balanced_config.min_similarity == 0.2
        # THOROUGH profile should have highest fetch multiplier
        thorough_config = SEARCH_PROFILES[SearchProfile.THOROUGH]
        assert thorough_config.fetch_multiplier == 5.0
        assert thorough_config.rerank_candidates == 200
        assert thorough_config.use_approximate is False
        assert thorough_config.min_similarity == 0.1

    def test_profile_enum_values(self):
        """Test SearchProfile enum values."""
        from local_deepwiki.core.vectorstore import SearchProfile

        assert SearchProfile.FAST.value == "fast"
        assert SearchProfile.BALANCED.value == "balanced"
        assert SearchProfile.THOROUGH.value == "thorough"

    def test_profile_enum_from_string(self):
        """Test creating SearchProfile from string."""
        from local_deepwiki.core.vectorstore import SearchProfile

        assert SearchProfile("fast") == SearchProfile.FAST
        assert SearchProfile("balanced") == SearchProfile.BALANCED
        assert SearchProfile("thorough") == SearchProfile.THOROUGH
        # Invalid string should raise ValueError
        with pytest.raises(ValueError):
            SearchProfile("invalid")
class TestFuzzySearchHelper:
    """Tests for FuzzySearchHelper class."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Build a plain VectorStore backed by a mock embedder."""
        from local_deepwiki.core.vectorstore import VectorStore

        return VectorStore(tmp_path / "test.lance", MockEmbeddingProvider())

    @pytest.fixture
    async def populated_store_with_names(self, vector_store):
        """Populate the store with chunks carrying realistic symbol names."""
        chunks = [
            make_chunk("func_1", content="def calculate_sum(a, b): return a + b"),
            make_chunk("func_2", content="def calculate_product(a, b): return a * b"),
            make_chunk("func_3", content="def calculate_difference(a, b): return a - b"),
            make_chunk("class_1", content="class UserManager: pass", chunk_type=ChunkType.CLASS),
            make_chunk("class_2", content="class UserService: pass", chunk_type=ChunkType.CLASS),
            make_chunk("method_1", content="def get_user(self): pass", chunk_type=ChunkType.METHOD),
        ]
        # Assign each chunk a meaningful symbol name for the fuzzy index.
        symbols = [
            "calculate_sum",
            "calculate_product",
            "calculate_difference",
            "UserManager",
            "UserService",
            "get_user",
        ]
        for chunk, symbol in zip(chunks, symbols):
            chunk.name = symbol
        chunks[5].parent_name = "UserService"
        await vector_store.create_or_update_table(chunks)
        return vector_store

    async def test_build_name_index(self, populated_store_with_names):
        """Building the index marks it built and records name counts."""
        from local_deepwiki.core.fuzzy_search import FuzzySearchHelper

        helper = FuzzySearchHelper(populated_store_with_names)
        await helper.build_name_index()
        assert helper.is_built
        counts = helper.get_stats()
        assert counts["total_names"] > 0
        assert counts["unique_names"] > 0

    async def test_find_similar_names_exact_match(self, populated_store_with_names):
        """An exact symbol name is found among the matches."""
        from local_deepwiki.core.fuzzy_search import FuzzySearchHelper

        helper = FuzzySearchHelper(populated_store_with_names)
        await helper.build_name_index()
        matches = helper.find_similar_names("calculate_sum", threshold=0.6)
        assert len(matches) > 0
        # The exact name should appear with a high score.
        assert "calculate_sum" in [name for name, _ in matches]

    async def test_find_similar_names_typo(self, populated_store_with_names):
        """A misspelled query still surfaces the intended names."""
        from local_deepwiki.core.fuzzy_search import FuzzySearchHelper

        helper = FuzzySearchHelper(populated_store_with_names)
        await helper.build_name_index()
        # "calcluate" is a transposition typo for "calculate".
        matches = helper.find_similar_names("calcluate_sum", threshold=0.5)
        assert len(matches) > 0
        assert any("calculate" in name for name, _ in matches)

    async def test_find_similar_names_threshold(self, populated_store_with_names):
        """Raising the threshold can only shrink the match set."""
        from local_deepwiki.core.fuzzy_search import FuzzySearchHelper

        helper = FuzzySearchHelper(populated_store_with_names)
        await helper.build_name_index()
        strict = helper.find_similar_names("xyz_random", threshold=0.9)
        loose = helper.find_similar_names("xyz_random", threshold=0.3)
        assert len(strict) <= len(loose)

    async def test_find_similar_names_limit(self, populated_store_with_names):
        """The limit parameter caps the number of matches."""
        from local_deepwiki.core.fuzzy_search import FuzzySearchHelper

        helper = FuzzySearchHelper(populated_store_with_names)
        await helper.build_name_index()
        matches = helper.find_similar_names("calculate", threshold=0.3, limit=2)
        assert len(matches) <= 2

    async def test_generate_suggestions(self, populated_store_with_names):
        """Empty search results produce fuzzy-name suggestions."""
        from local_deepwiki.core.fuzzy_search import FuzzySearchHelper

        helper = FuzzySearchHelper(populated_store_with_names)
        await helper.build_name_index()
        no_hits: list = []
        suggestions = helper.generate_suggestions("calcluate", no_hits, threshold=0.5)
        # "calculate*" names should be offered despite the typo.
        assert len(suggestions) > 0

    async def test_generate_suggestions_excludes_existing(self, populated_store_with_names):
        """Names already present in the results are not re-suggested."""
        from local_deepwiki.core.fuzzy_search import FuzzySearchHelper
        from local_deepwiki.models import SearchResult

        helper = FuzzySearchHelper(populated_store_with_names)
        await helper.build_name_index()
        existing = make_chunk("test")
        existing.name = "calculate_sum"
        prior = [SearchResult(chunk=existing, score=0.3, highlights=[])]
        suggestions = helper.generate_suggestions("calculate", prior, threshold=0.5)
        assert "calculate_sum" not in suggestions

    async def test_empty_store_name_index(self, vector_store):
        """Indexing an empty store succeeds with zero names."""
        from local_deepwiki.core.fuzzy_search import FuzzySearchHelper

        helper = FuzzySearchHelper(vector_store)
        await helper.build_name_index()
        assert helper.is_built
        assert helper.get_stats()["total_names"] == 0
class TestAutoFuzzySearch:
    """Tests for automatic fuzzy search fallback."""

    @pytest.fixture
    def vector_store(self, tmp_path):
        """Build a VectorStore with auto-fuzzy fallback enabled."""
        from local_deepwiki.config import FuzzySearchConfig
        from local_deepwiki.core.vectorstore import VectorStore

        config = FuzzySearchConfig(
            auto_fuzzy_threshold=0.5,
            suggestion_threshold=0.5,
            max_suggestions=3,
            enable_auto_fuzzy=True,
        )
        return VectorStore(
            tmp_path / "test.lance",
            MockEmbeddingProvider(),
            fuzzy_search_config=config,
        )

    @pytest.fixture
    async def populated_store(self, vector_store):
        """Populate the store with two named calculation functions."""
        first = make_chunk("func_1", content="def calculate_sum(a, b): return a + b")
        second = make_chunk("func_2", content="def calculate_product(a, b): return a * b")
        first.name = "calculate_sum"
        second.name = "calculate_product"
        await vector_store.create_or_update_table([first, second])
        return vector_store

    async def test_should_auto_enable_fuzzy_empty_results(self):
        """No results at all should trigger the fuzzy fallback."""
        from local_deepwiki.core.fuzzy_search import should_auto_enable_fuzzy

        assert should_auto_enable_fuzzy([], threshold=0.5) is True

    async def test_should_auto_enable_fuzzy_low_scores(self):
        """Results scoring below the threshold should trigger the fallback."""
        from local_deepwiki.core.fuzzy_search import should_auto_enable_fuzzy
        from local_deepwiki.models import SearchResult

        weak = [SearchResult(chunk=make_chunk("test"), score=0.3, highlights=[])]
        assert should_auto_enable_fuzzy(weak, threshold=0.5) is True

    async def test_should_not_auto_enable_fuzzy_high_scores(self):
        """Strong results should suppress the fuzzy fallback."""
        from local_deepwiki.core.fuzzy_search import should_auto_enable_fuzzy
        from local_deepwiki.models import SearchResult

        strong = [SearchResult(chunk=make_chunk("test"), score=0.8, highlights=[])]
        assert should_auto_enable_fuzzy(strong, threshold=0.5) is False

    async def test_search_with_auto_suggest(self, populated_store):
        """Search succeeds with auto_suggest turned on."""
        hits = await populated_store.search("calculate", auto_suggest=True)
        # Mock embeddings yield uniform scores, so results come back.
        assert len(hits) > 0

    async def test_search_without_auto_suggest(self, populated_store):
        """Search succeeds with auto_suggest turned off."""
        hits = await populated_store.search("calculate", auto_suggest=False)
        # Plain results are returned without any suggestion machinery.
        assert len(hits) > 0
class TestFuzzySearchConfig:
    """Tests for FuzzySearchConfig."""

    def test_default_config(self):
        """Defaults match the documented configuration values."""
        from local_deepwiki.config import FuzzySearchConfig

        defaults = FuzzySearchConfig()
        assert defaults.auto_fuzzy_threshold == 0.5
        assert defaults.suggestion_threshold == 0.6
        assert defaults.max_suggestions == 3
        assert defaults.enable_auto_fuzzy is True

    def test_custom_config(self):
        """Explicit values override every default field."""
        from local_deepwiki.config import FuzzySearchConfig

        custom = FuzzySearchConfig(
            auto_fuzzy_threshold=0.7,
            suggestion_threshold=0.8,
            max_suggestions=5,
            enable_auto_fuzzy=False,
        )
        assert custom.auto_fuzzy_threshold == 0.7
        assert custom.suggestion_threshold == 0.8
        assert custom.max_suggestions == 5
        assert custom.enable_auto_fuzzy is False

    def test_config_validation(self):
        """Out-of-range values are rejected by pydantic validation."""
        from pydantic import ValidationError

        from local_deepwiki.config import FuzzySearchConfig

        # Thresholds must stay within [0.0, 1.0].
        with pytest.raises(ValidationError):
            FuzzySearchConfig(auto_fuzzy_threshold=1.5)
        with pytest.raises(ValidationError):
            FuzzySearchConfig(auto_fuzzy_threshold=-0.1)
        # At least one suggestion must be allowed.
        with pytest.raises(ValidationError):
            FuzzySearchConfig(max_suggestions=0)