ChunkHound

Overview Schema Related Servers Score Discussions

chunkhound
tests
integration

test_lancedb_embeddings.py•18 KiB

"""Integration tests for LanceDB embedding storage.

Tests that embeddings are properly generated and stored when using the LanceDB
backend, addressing the bug where .search() failed to find chunks with NULL
embedding columns.
"""

import asyncio
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock

import pytest

# Provider/model labels attached to every embedding record stored by these tests.
_PROVIDER = "test"
_MODEL = "test-model"


def _insert_test_file(provider):
    """Insert the placeholder ``test.py`` file record and return its ID.

    Chunks carry a ``file_id``, so a file row has to exist before any chunk
    is inserted.
    """
    # Imported lazily, matching the tests' original function-scope imports.
    from chunkhound.core.models import File
    from chunkhound.core.types.common import Language

    return provider.insert_file(
        File(
            path="test.py",
            mtime=1234567890.0,
            language=Language.PYTHON,
            size_bytes=100,
        )
    )


def _function_chunk(file_id, code, line, symbol):
    """Build a one-line Python FUNCTION chunk that starts and ends at ``line``."""
    from chunkhound.core.models import Chunk
    from chunkhound.core.types.common import ChunkType, Language

    return Chunk(
        file_id=file_id,
        code=code,
        start_line=line,
        end_line=line,
        chunk_type=ChunkType.FUNCTION,
        language=Language.PYTHON,
        symbol=symbol,
    )


def _embedding_record(chunk_id, vector):
    """Build one ``insert_embeddings_batch`` entry for ``chunk_id``.

    ``dims`` is derived from the vector so the two can never disagree.
    """
    return {
        "chunk_id": chunk_id,
        "provider": _PROVIDER,
        "model": _MODEL,
        "dims": len(vector),
        "embedding": vector,
    }


def test_lancedb_embeddings_stored_during_indexing(lancedb_provider, tmp_path):
    """Verify embeddings are stored in LanceDB during indexing with mock provider."""
    file_id = _insert_test_file(lancedb_provider)

    chunks = [
        _function_chunk(file_id, "def hello(): pass", 1, "hello"),
        _function_chunk(file_id, "def world(): return 42", 2, "world"),
    ]

    # insert_chunks_batch returns the IDs of the inserted chunks.
    chunk_ids = lancedb_provider.insert_chunks_batch(chunks)
    assert len(chunk_ids) == 2, "Should insert 2 chunks"

    stats = lancedb_provider.get_stats()
    assert stats["chunks"] == 2, "Should have 2 chunks in database"

    embedding_dim = 8  # Small dimension for testing
    embeddings_data = [
        _embedding_record(chunk_ids[0], [0.1] * embedding_dim),
        _embedding_record(chunk_ids[1], [0.2] * embedding_dim),
    ]

    stored_count = lancedb_provider.insert_embeddings_batch(embeddings_data)
    assert stored_count == 2, f"Should store 2 embeddings, got {stored_count}"

    # Every stored embedding must be retrievable with the expected size.
    for chunk_id in chunk_ids:
        embedding = lancedb_provider.get_embedding_by_chunk_id(
            chunk_id, _PROVIDER, _MODEL
        )
        assert embedding is not None, f"Embedding for chunk {chunk_id} should exist"
        assert embedding.vector is not None, "Embedding vector should not be None"
        assert len(embedding.vector) == embedding_dim, (
            f"Embedding should have {embedding_dim} dimensions"
        )


def test_lancedb_indexing_flow_creates_chunks(lancedb_provider, tmp_path):
    """Test that LanceDB indexing flow creates chunks correctly.

    Note: This test verifies chunk creation only. For embedding storage tests,
    see test_lancedb_embeddings_stored_during_indexing and
    test_lancedb_embedding_update_finds_chunks which test the embedding
    insertion path directly without needing to inject mock providers.
    """
    from chunkhound.services.indexing_coordinator import IndexingCoordinator

    # Create a test Python file
    test_file = tmp_path / "test_module.py"
    test_file.write_text("""
def greet(name):
    return f"Hello, {name}!"

def add(a, b):
    return a + b

class Calculator:
    def multiply(self, x, y):
        return x * y
""")

    # No embedding provider: this test only exercises chunk creation.
    coord = IndexingCoordinator(
        database_provider=lancedb_provider,
        base_directory=tmp_path,
        embedding_provider=None,
    )

    result = asyncio.run(
        coord.process_directory(tmp_path, patterns=["**/*.py"], exclude_patterns=[])
    )

    assert result["files_processed"] == 1, "Should process 1 file"

    stats = lancedb_provider.get_stats()
    assert stats["chunks"] > 0, "Should have chunks in database"

    # The embedding tests (test_lancedb_embedding_update_finds_chunks) verify
    # that the fix for NULL embedding column search works correctly


def test_lancedb_embedding_update_finds_chunks(lancedb_provider, tmp_path):
    """Test that embedding update correctly finds chunks with NULL embeddings.

    This specifically tests the fix for the bug where .search() failed to find
    chunks with NULL embedding columns.
    """
    file_id = _insert_test_file(lancedb_provider)

    # Chunks start out with NULL embeddings (default state).
    # Line numbers must be positive, hence ``i + 1``.
    chunks = [
        _function_chunk(file_id, f"def func_{i}(): return {i}", i + 1, f"func_{i}")
        for i in range(10)
    ]

    chunk_ids = lancedb_provider.insert_chunks_batch(chunks)
    assert len(chunk_ids) == 10, "Should insert 10 chunks"

    stats_before = lancedb_provider.get_stats()
    assert stats_before["chunks"] == 10

    # Adding embeddings is where the bug manifested.
    embedding_dim = 16
    embeddings_data = [
        _embedding_record(cid, [float(i) / 100] * embedding_dim)
        for i, cid in enumerate(chunk_ids)
    ]

    stored_count = lancedb_provider.insert_embeddings_batch(embeddings_data)

    # THE KEY ASSERTION: All embeddings should be stored
    assert stored_count == 10, (
        f"Should store all 10 embeddings, but only stored {stored_count}. "
        "This indicates the chunk lookup in insert_embeddings_batch failed."
    )

    for chunk_id in chunk_ids:
        emb = lancedb_provider.get_embedding_by_chunk_id(chunk_id, _PROVIDER, _MODEL)
        assert emb is not None, f"Embedding for chunk {chunk_id} should be retrievable"
        assert emb.vector is not None, "Embedding vector should not be None"


def test_lancedb_find_similar_chunks_basic(lancedb_provider, tmp_path):
    """Test that find_similar_chunks returns similar chunks ranked by score."""
    file_id = _insert_test_file(lancedb_provider)

    chunks = [
        _function_chunk(
            file_id,
            "def authenticate_user(username, password): pass",
            1,
            "authenticate_user",
        ),
        _function_chunk(file_id, "def login_user(credentials): pass", 2, "login_user"),
        _function_chunk(
            file_id, "def calculate_taxes(income): pass", 3, "calculate_taxes"
        ),
    ]
    chunk_ids = lancedb_provider.insert_chunks_batch(chunks)

    # Chunks 0 and 1 get near-parallel vectors (auth-related); chunk 2 points
    # in a different direction.
    embeddings_data = [
        _embedding_record(chunk_ids[0], [0.9, 0.8, 0.7, 0.6, 0.1, 0.1, 0.1, 0.1]),
        _embedding_record(chunk_ids[1], [0.8, 0.9, 0.6, 0.7, 0.1, 0.1, 0.1, 0.1]),
        _embedding_record(chunk_ids[2], [0.1, 0.1, 0.1, 0.1, 0.9, 0.8, 0.7, 0.6]),
    ]
    lancedb_provider.insert_embeddings_batch(embeddings_data)

    similar = lancedb_provider.find_similar_chunks(
        chunk_id=chunk_ids[0],
        provider=_PROVIDER,
        model=_MODEL,
        limit=10,
    )

    # Should return chunks 1 and 2 (excluding source chunk 0)
    assert len(similar) == 2, f"Should find 2 similar chunks, got {len(similar)}"

    # Results should be ranked by similarity score
    assert similar[0]["chunk_id"] == chunk_ids[1], "Most similar should be chunk 1"
    assert similar[1]["chunk_id"] == chunk_ids[2], "Least similar should be chunk 2"

    # Scores should be in descending order
    assert similar[0]["score"] > similar[1]["score"], "Scores should be ranked"


def test_lancedb_find_similar_chunks_excludes_source(lancedb_provider, tmp_path):
    """Test that source chunk is excluded from results."""
    file_id = _insert_test_file(lancedb_provider)

    chunks = [
        _function_chunk(file_id, f"def func_{i}(): pass", i + 1, f"func_{i}")
        for i in range(3)
    ]
    chunk_ids = lancedb_provider.insert_chunks_batch(chunks)

    # Identical embeddings: every chunk is equally similar to every other.
    embedding_dim = 8
    embeddings_data = [
        _embedding_record(cid, [0.5] * embedding_dim) for cid in chunk_ids
    ]
    lancedb_provider.insert_embeddings_batch(embeddings_data)

    similar = lancedb_provider.find_similar_chunks(
        chunk_id=chunk_ids[0],
        provider=_PROVIDER,
        model=_MODEL,
        limit=10,
    )

    # Should return 2 chunks (excluding source)
    assert len(similar) == 2, "Should exclude source chunk"

    result_ids = [r["chunk_id"] for r in similar]
    assert chunk_ids[0] not in result_ids, "Source chunk should be excluded"
    assert chunk_ids[1] in result_ids, "Chunk 1 should be included"
    assert chunk_ids[2] in result_ids, "Chunk 2 should be included"


def test_lancedb_find_similar_chunks_no_embedding(lancedb_provider, tmp_path):
    """Test that empty list is returned when chunk has no embedding."""
    file_id = _insert_test_file(lancedb_provider)

    # The chunk is inserted WITHOUT any embedding.
    chunks = [_function_chunk(file_id, "def test(): pass", 1, "test")]
    chunk_ids = lancedb_provider.insert_chunks_batch(chunks)

    similar = lancedb_provider.find_similar_chunks(
        chunk_id=chunk_ids[0],
        provider=_PROVIDER,
        model=_MODEL,
        limit=10,
    )

    assert similar == [], "Should return empty list when no embedding exists"


def test_lancedb_find_similar_chunks_threshold(lancedb_provider, tmp_path):
    """Test that similarity threshold filters results correctly."""
    file_id = _insert_test_file(lancedb_provider)

    chunks = [
        _function_chunk(file_id, f"def func_{i}(): pass", i + 1, f"func_{i}")
        for i in range(3)
    ]
    chunk_ids = lancedb_provider.insert_chunks_batch(chunks)

    # Chunk 1 is nearly parallel to chunk 0; chunk 2 is orthogonal to it.
    embeddings_data = [
        _embedding_record(chunk_ids[0], [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]),
        _embedding_record(chunk_ids[1], [0.9, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]),
        _embedding_record(chunk_ids[2], [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]),
    ]
    lancedb_provider.insert_embeddings_batch(embeddings_data)

    # High threshold - only very similar chunks should survive.
    similar = lancedb_provider.find_similar_chunks(
        chunk_id=chunk_ids[0],
        provider=_PROVIDER,
        model=_MODEL,
        limit=10,
        threshold=0.8,
    )

    # Should only return highly similar chunk 1
    assert len(similar) >= 1, "Should find at least one similar chunk"
    assert all(r["score"] >= 0.8 for r in similar), "All results should meet threshold"


def test_lancedb_find_similar_chunks_limit(lancedb_provider, tmp_path):
    """Test that result limit is respected."""
    file_id = _insert_test_file(lancedb_provider)

    chunks = [
        _function_chunk(file_id, f"def func_{i}(): pass", i + 1, f"func_{i}")
        for i in range(10)
    ]
    chunk_ids = lancedb_provider.insert_chunks_batch(chunks)

    # Every chunk gets the same (similar) vector, so more candidates exist
    # than the limit allows.
    embedding_dim = 8
    embeddings_data = [
        _embedding_record(cid, [0.5 + i * 0.01 for i in range(embedding_dim)])
        for cid in chunk_ids
    ]
    lancedb_provider.insert_embeddings_batch(embeddings_data)

    limit = 3
    similar = lancedb_provider.find_similar_chunks(
        chunk_id=chunk_ids[0],
        provider=_PROVIDER,
        model=_MODEL,
        limit=limit,
    )

    assert len(similar) == limit, f"Should respect limit of {limit}"


def test_lancedb_find_similar_chunks_wrong_provider(lancedb_provider, tmp_path):
    """Test that empty list is returned for non-existent provider/model."""
    file_id = _insert_test_file(lancedb_provider)

    chunks = [_function_chunk(file_id, "def test(): pass", 1, "test")]
    chunk_ids = lancedb_provider.insert_chunks_batch(chunks)

    # The embedding is stored under the module's provider/model labels...
    embedding_dim = 8
    embeddings_data = [_embedding_record(chunk_ids[0], [0.5] * embedding_dim)]
    lancedb_provider.insert_embeddings_batch(embeddings_data)

    # ...and then looked up under labels that were never used.
    similar = lancedb_provider.find_similar_chunks(
        chunk_id=chunk_ids[0],
        provider="wrong-provider",
        model="wrong-model",
        limit=10,
    )

    assert similar == [], "Should return empty list for non-existent provider/model"

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ofriw/chunkhound'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_lancedb_embeddings.py•18 KiB