semantic-code-mcp

test_hybrid_search.py•5 KiB

"""Tests for hybrid search (vector + full-text).""" import pytest from semantic_code_mcp.models import Chunk, ChunkType, ChunkWithEmbedding from semantic_code_mcp.storage.lancedb import LanceDBConnection, LanceDBVectorStore @pytest.fixture def store_with_chunks(lance_connection: LanceDBConnection) -> LanceDBVectorStore: """Create a store with sample chunks for testing hybrid search.""" store = LanceDBVectorStore(lance_connection) # Create embeddings that are actually different (not parallel) # Using varied values to get different cosine similarities emb_low = [0.1 if i % 2 == 0 else -0.1 for i in range(384)] emb_high = [0.9 if i % 2 == 0 else 0.8 for i in range(384)] emb_mid = [0.5 if i % 3 == 0 else 0.3 for i in range(384)] chunks = [ # Chunk with "duration_ms" - should be found by keyword search ChunkWithEmbedding( chunk=Chunk( file_path="/project/indexer.py", line_start=100, line_end=120, content='log.debug("timing", duration_ms=round(time.time() - t0))', chunk_type=ChunkType.METHOD, name="index", ), embedding=emb_low, # Low similarity to query ), # Chunk semantically about timing but no "duration_ms" keyword ChunkWithEmbedding( chunk=Chunk( file_path="/project/utils.py", line_start=1, line_end=10, content="def measure_elapsed_time(): return time.perf_counter()", chunk_type=ChunkType.FUNCTION, name="measure_elapsed_time", ), embedding=emb_high, # High similarity to query ), # Chunk with "log.debug" but different topic ChunkWithEmbedding( chunk=Chunk( file_path="/project/cache.py", line_start=50, line_end=60, content='log.debug("cache_saved", files_count=len(files))', chunk_type=ChunkType.METHOD, name="_save", ), embedding=emb_mid, ), ] store.add_chunks(chunks) return store class TestHybridSearch: """Tests for hybrid search functionality.""" def test_vector_only_search_misses_keyword_match(self, store_with_chunks: LanceDBVectorStore): """Pure vector search may miss results with exact keyword matches.""" # Query embedding similar to "measure_elapsed_time" chunk 
(emb_high pattern) query_embedding = [0.9 if i % 2 == 0 else 0.8 for i in range(384)] results = store_with_chunks.search(query_embedding, limit=2) # Vector search finds semantically similar, not keyword matches assert len(results) > 0 # The "duration_ms" chunk has low embedding similarity, may not appear names = [r.name for r in results] assert "measure_elapsed_time" in names def test_fts_search_finds_exact_keyword(self, store_with_chunks: LanceDBVectorStore): """Full-text search finds chunks containing exact keywords.""" results = store_with_chunks.search_fts("duration_ms", limit=5) assert len(results) >= 1 assert any("duration_ms" in r.content for r in results) def test_fts_search_returns_empty_for_no_match(self, store_with_chunks: LanceDBVectorStore): """FTS returns empty list when no matches found.""" results = store_with_chunks.search_fts("nonexistent_keyword_xyz", limit=5) assert results == [] def test_hybrid_search_combines_both(self, store_with_chunks: LanceDBVectorStore): """Hybrid search finds both semantic and keyword matches.""" query_embedding = [ 0.9 if i % 2 == 0 else 0.8 for i in range(384) ] # Similar to measure_elapsed_time results = store_with_chunks.search_hybrid( query_embedding=query_embedding, query_text="duration_ms", limit=5, ) # Should find the keyword match in results assert "duration_ms" in " ".join(r.content for r in results) def test_hybrid_search_weight_adjustable(self, store_with_chunks: LanceDBVectorStore): """Can adjust weight between vector and FTS search.""" query_embedding = [0.9 if i % 2 == 0 else 0.8 for i in range(384)] # Heavy FTS weight should prioritize keyword matches results_fts_heavy = store_with_chunks.search_hybrid( query_embedding=query_embedding, query_text="duration_ms", limit=5, vector_weight=0.3, # 30% vector, 70% FTS ) # Heavy vector weight should prioritize semantic matches results_vec_heavy = store_with_chunks.search_hybrid( query_embedding=query_embedding, query_text="duration_ms", limit=5, vector_weight=0.9, 
# 90% vector, 10% FTS ) # Both should return results, but ordering may differ assert len(results_fts_heavy) > 0 assert len(results_vec_heavy) > 0

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/vrppaul/semantic-code-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_hybrid_search.py•5 KiB