"""Integration tests for LanceDB chunk deduplication fix.
Tests verify that the three-layer defense strategy prevents duplicate chunks:
- Layer 1: Database merge_insert prevents duplicates
- Layer 2: Scalar index provides performance
- Layer 3: Event deduplication reduces redundant work
"""

import asyncio

import pytest

# Skip these tests if lancedb is not available.
pytest.importorskip("lancedb")


class TestChunkDeduplication:
    """Test suite for chunk deduplication via merge_insert."""

    def test_duplicate_file_processing_no_duplicate_chunks(self, tmp_path, lancedb_provider):
        """Verify processing the same file twice doesn't create duplicate chunks."""
        from chunkhound.services.indexing_coordinator import IndexingCoordinator

        # Create test file
        test_file = tmp_path / "test.py"
        test_file.write_text("def test_fn(): pass\nclass TestClass: pass")

        coord = IndexingCoordinator(database_provider=lancedb_provider, base_directory=tmp_path)

        # Process the file twice (simulating created + modified events)
        result1 = asyncio.run(coord.process_file(test_file))
        result2 = asyncio.run(coord.process_file(test_file))

        # Both runs should succeed
        assert result1.get("status") != "error"
        assert result2.get("status") != "error"

        # Get all chunks for the file
        file_record = lancedb_provider.get_file_by_path("test.py")
        assert file_record is not None
        file_id = file_record["id"] if isinstance(file_record, dict) else file_record.id
        all_chunks = lancedb_provider.get_chunks_by_file_id(file_id)
        chunk_ids = [c["id"] for c in all_chunks]

        # Verify no duplicate chunk IDs
        assert len(chunk_ids) == len(set(chunk_ids)), f"Duplicate chunk IDs found: {chunk_ids}"
        print(f"✓ Processed file twice, got {len(chunk_ids)} unique chunks (no duplicates)")

    def test_vue_haskell_no_duplicate_chunk_ids(self, tmp_path, lancedb_provider):
        """Verify Vue/Haskell files with identical content get unique chunk IDs.

        Regression test for: Duplicate chunk IDs detected in batch errors.
        Vue directives and elements with identical content should get different IDs
        because they have different chunk types (concept-aware hashing).
        """
        from chunkhound.services.indexing_coordinator import IndexingCoordinator

        # Create a Vue file with structure likely to produce identical content.
        # The Vue parser may extract both directives (DEFINITION) and elements (BLOCK).
        vue_file = tmp_path / "component.vue"
        vue_file.write_text("""<template>
  <div v-if="show">Content</div>
</template>
<script>
export default { data() { return { show: true } } }
</script>
""")

        coord = IndexingCoordinator(database_provider=lancedb_provider, base_directory=tmp_path)

        # Index the file
        result = asyncio.run(coord.process_file(vue_file))

        # Should succeed without duplicate ID errors
        assert result.get("status") != "error", f"Indexing failed: {result}"

        # Verify no duplicate chunk IDs in the database
        file_record = lancedb_provider.get_file_by_path("component.vue")
        assert file_record is not None
        file_id = file_record["id"] if isinstance(file_record, dict) else file_record.id
        chunks = lancedb_provider.get_chunks_by_file_id(file_id)
        chunk_ids = [c["id"] for c in chunks]

        assert len(chunk_ids) == len(set(chunk_ids)), \
            f"Found duplicate chunk IDs in Vue file: {chunk_ids}"
        print(f"✓ Vue file indexed: {len(chunk_ids)} unique chunks (no duplicates)")

    def test_rapid_file_modifications(self, tmp_path, lancedb_provider):
        """Simulate rapid file modifications (editor save pattern)."""
        from chunkhound.services.indexing_coordinator import IndexingCoordinator

        test_file = tmp_path / "rapid.py"
        coord = IndexingCoordinator(database_provider=lancedb_provider, base_directory=tmp_path)

        # Version 1: initial content
        test_file.write_text("def initial(): pass")
        result1 = asyncio.run(coord.process_file(test_file))

        # Version 2: modified content (simulate an editor save triggering a modified event)
        test_file.write_text("def initial(): return 1")
        result2 = asyncio.run(coord.process_file(test_file))

        # Version 3: another modification
        test_file.write_text("def initial(): return 2")
        result3 = asyncio.run(coord.process_file(test_file))

        # All three runs should succeed
        for result in (result1, result2, result3):
            assert result.get("status") != "error"

        # Get final chunks
        file_record = lancedb_provider.get_file_by_path("rapid.py")
        assert file_record is not None
        file_id = file_record["id"] if isinstance(file_record, dict) else file_record.id
        all_chunks = lancedb_provider.get_chunks_by_file_id(file_id)
        chunk_ids = [c["id"] for c in all_chunks]

        # Verify no duplicates despite multiple updates
        assert len(chunk_ids) == len(set(chunk_ids)), f"Duplicate chunk IDs found: {chunk_ids}"
        print(f"✓ Rapid modifications: {len(chunk_ids)} unique chunks (no duplicates)")


class TestConcurrentProcessing:
    """Test suite for concurrent file processing scenarios."""

    @pytest.mark.asyncio
    async def test_concurrent_file_processing(self, tmp_path, lancedb_provider):
        """Verify concurrent processing of the same file doesn't create duplicates."""
        from chunkhound.services.indexing_coordinator import IndexingCoordinator

        test_file = tmp_path / "concurrent.py"
        test_file.write_text("def concurrent_test(): pass\nclass Concurrent: pass")

        coord = IndexingCoordinator(database_provider=lancedb_provider, base_directory=tmp_path)

        # Process the same file concurrently (simulating a race between the
        # initial scan and realtime events)
        results = await asyncio.gather(
            coord.process_file(test_file),
            coord.process_file(test_file),
            coord.process_file(test_file),
        )

        # All should complete
        assert all(r.get("status") != "error" for r in results)

        # Verify no duplicates
        file_record = lancedb_provider.get_file_by_path("concurrent.py")
        assert file_record is not None
        file_id = file_record["id"] if isinstance(file_record, dict) else file_record.id
        all_chunks = lancedb_provider.get_chunks_by_file_id(file_id)
        chunk_ids = [c["id"] for c in all_chunks]

        assert len(chunk_ids) == len(set(chunk_ids)), f"Duplicate chunk IDs found: {chunk_ids}"
        print(f"✓ Concurrent processing: {len(chunk_ids)} unique chunks (no duplicates)")


class TestScalarIndexCreation:
    """Test suite for scalar index creation."""

    def test_scalar_index_created_on_connect(self, lancedb_provider):
        """Verify scalar index creation happens during connection."""
        # Index creation happens in _executor_create_indexes (called from
        # _executor_connect), so a connected provider with a chunks table
        # implies the index-creation path ran.
        assert lancedb_provider._chunks_table is not None
        print("✓ Scalar index creation verified (table exists)")


class TestSearchDeduplication:
    """Test suite for search result deduplication across LanceDB fragments.

    Regression tests for the bug where search_regex and search_semantic returned
    duplicate chunk_ids due to fragmentation.
    """

    def test_regex_search_no_duplicates_with_fragments(
        self, fragmented_lancedb_provider, tmp_path
    ):
        """Verify regex search returns unique chunk_ids with 50+ fragments."""
        from chunkhound.core.models import Chunk, File
        from chunkhound.core.types.common import ChunkType, Language
        from tests.fixtures.fragmentation_helpers import verify_no_duplicate_chunk_ids

        # The fragmented_lancedb_provider already has 50 fragments.
        # Add a test file with searchable content.
        test_file = File(
            path="searchable.py",
            mtime=9999999999.0,
            language=Language.PYTHON,
            size_bytes=100,
        )
        file_id = fragmented_lancedb_provider.insert_file(test_file)

        # Insert a chunk with a unique pattern
        chunk = Chunk(
            file_id=file_id,
            code="def unique_search_target_12345(): return 'found'",
            start_line=1,
            end_line=1,
            chunk_type=ChunkType.FUNCTION,
            language=Language.PYTHON,
            symbol="unique_search_target_12345",
        )
        fragmented_lancedb_provider.insert_chunk(chunk)

        # Search for the unique pattern
        results, pagination = fragmented_lancedb_provider.search_regex(
            pattern="unique_search_target_12345",
            page_size=100,
            offset=0,
            path_filter=None,
        )

        # Verify no duplicates
        is_valid, error_msg = verify_no_duplicate_chunk_ids(results)
        assert is_valid, f"Regex search returned duplicates: {error_msg}"
        assert len(results) >= 1, "Should find at least the inserted chunk"
        print(
            f"✓ Regex search with 50 fragments: {len(results)} unique results (no duplicates)"
        )
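
    # The fragmentation fix boils down to deduplicating query rows by chunk_id
    # after scanning across fragments. A minimal sketch of that post-filter --
    # an illustration of the technique, not the provider's exact implementation:
    #
    #   def dedupe_by_chunk_id(rows: list[dict]) -> list[dict]:
    #       seen: set[int] = set()
    #       unique: list[dict] = []
    #       for row in rows:
    #           if row["chunk_id"] not in seen:
    #               seen.add(row["chunk_id"])
    #               unique.append(row)
    #       return unique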

    def test_file_update_creates_no_search_duplicates(
        self, lancedb_provider, tmp_path
    ):
        """Verify file updates don't create duplicate search results."""
        from chunkhound.core.models import Chunk, File
        from chunkhound.core.types.common import ChunkType, Language
        from tests.fixtures.fragmentation_helpers import verify_no_duplicate_chunk_ids

        # Insert initial file
        test_file = File(
            path="updated_file.py",
            mtime=1000000.0,
            language=Language.PYTHON,
            size_bytes=100,
        )
        file_id = lancedb_provider.insert_file(test_file)

        # Insert initial chunk
        chunk = Chunk(
            file_id=file_id,
            code="def update_test_v1(): return 1",
            start_line=1,
            end_line=1,
            chunk_type=ChunkType.FUNCTION,
            language=Language.PYTHON,
            symbol="update_test_v1",
        )
        lancedb_provider.insert_chunk(chunk)

        # Update file 5 times (creates multiple fragments)
        for i in range(2, 7):
            updated_file = File(
                id=file_id,
                path="updated_file.py",
                mtime=1000000.0 + i,
                language=Language.PYTHON,
                size_bytes=100 + i,
            )
            lancedb_provider.insert_file(updated_file)

            updated_chunk = Chunk(
                file_id=file_id,
                code=f"def update_test_v{i}(): return {i}",
                start_line=1,
                end_line=1,
                chunk_type=ChunkType.FUNCTION,
                language=Language.PYTHON,
                symbol=f"update_test_v{i}",
            )
            lancedb_provider.insert_chunk(updated_chunk)

        # Search for the latest version
        results, pagination = lancedb_provider.search_regex(
            pattern="update_test_v6",
            page_size=100,
            offset=0,
            path_filter=None,
        )

        # Verify no duplicates
        is_valid, error_msg = verify_no_duplicate_chunk_ids(results)
        assert is_valid, f"File update search returned duplicates: {error_msg}"

        # Should find exactly 1 result (the latest version)
        assert len(results) == 1, f"Expected 1 result, got {len(results)}"
        print(
            f"✓ File update (5 iterations): {len(results)} unique result (no duplicates)"
        )

    def test_regex_search_pagination_counts_unique_chunks(
        self, fragmented_lancedb_provider, tmp_path
    ):
        """Verify pagination total count reflects unique chunks, not duplicates."""
        from chunkhound.core.models import Chunk, File
        from chunkhound.core.types.common import ChunkType, Language

        # Insert a file with multiple searchable chunks
        test_file = File(
            path="multi_chunk.py",
            mtime=9999999999.0,
            language=Language.PYTHON,
            size_bytes=500,
        )
        file_id = fragmented_lancedb_provider.insert_file(test_file)

        # Insert 10 chunks with a common pattern
        for i in range(10):
            chunk = Chunk(
                file_id=file_id,
                code=f"def pagination_test_{i}(): return {i}",
                start_line=i + 1,
                end_line=i + 1,
                chunk_type=ChunkType.FUNCTION,
                language=Language.PYTHON,
                symbol=f"pagination_test_{i}",
            )
            fragmented_lancedb_provider.insert_chunk(chunk)

        # Search for the common pattern
        results, pagination = fragmented_lancedb_provider.search_regex(
            pattern="pagination_test_",
            page_size=100,
            offset=0,
            path_filter=None,
        )

        # Verify the total count matches the unique results
        assert pagination["total"] == len(
            results
        ), f"Total count {pagination['total']} != actual results {len(results)}"
        assert len(results) == 10, f"Expected 10 unique chunks, got {len(results)}"

        # Verify all chunk_ids are unique
        chunk_ids = [r["chunk_id"] for r in results]
        assert len(chunk_ids) == len(
            set(chunk_ids)
        ), "Found duplicate chunk_ids in results"
        print(
            f"✓ Pagination counts: total={pagination['total']}, results={len(results)} (match, no duplicates)"
        )


if __name__ == "__main__":
    pytest.main([__file__, "-v"])