Code-Index-MCP

test_bm25_hybrid_search.py•13.8 KiB

#!/usr/bin/env python3
"""
Test script for BM25 hybrid search functionality.

This script tests:
1. BM25 indexer initialization and operations
2. Hybrid search with different configurations
3. Search result ranking and fusion
4. Performance and accuracy
"""

import shutil
import tempfile
from pathlib import Path

import pytest

from mcp_server.indexer.bm25_indexer import BM25Indexer
from mcp_server.indexer.hybrid_search import HybridSearch, HybridSearchConfig
from mcp_server.storage.sqlite_store import SQLiteStore
from mcp_server.utils.fuzzy_indexer import FuzzyIndexer

# Multi-line fixture sources live at module level so their text does not
# depend on the indentation of the test method that uses them.
_GREETER_SOURCE = '''
def hello_world():
    """Print hello world."""
    print("Hello, World!")

class Greeter:
    def greet(self, name):
        return f"Hello, {name}!"
'''

_STATS_SOURCE = '''
def calculate_sum(numbers):
    """Calculate the sum of a list of numbers."""
    return sum(numbers)

def calculate_average(numbers):
    """Calculate the average of a list of numbers."""
    if not numbers:
        return 0
    return sum(numbers) / len(numbers)
'''

_MATH_SOURCE = '''
import math

def calculate_mean(values):
    """Calculate arithmetic mean."""
    return sum(values) / len(values)

def calculate_median(values):
    """Calculate median value."""
    sorted_values = sorted(values)
    n = len(sorted_values)
    if n % 2 == 0:
        return (sorted_values[n//2 - 1] + sorted_values[n//2]) / 2
    return sorted_values[n//2]
'''


def _index_file(
    storage,
    bm25_indexer,
    repo_id,
    root,
    filename,
    content,
    language,
    *,
    metadata=None,
    fuzzy_indexer=None,
):
    """Write a fixture file, pass it to the store, and index its content.

    Args:
        storage: Store whose ``store_file`` is called for the new file.
        bm25_indexer: Indexer whose ``add_document`` receives the content.
        repo_id: Repository id forwarded to ``store_file``.
        root: Directory in which the file is created.
        filename: File name, also used as the relative path.
        content: Text written to disk and indexed.
        language: Language label forwarded to the store and, unless
            ``metadata`` is given, to the BM25 metadata.
        metadata: Optional BM25 metadata overriding ``{"language": language}``.
        fuzzy_indexer: When given, its ``add_file`` is called as well.

    Returns:
        The path string under which the document was indexed.
    """
    filepath = Path(root) / filename
    filepath.write_text(content)
    doc_path = str(filepath)

    # Only the side effect of store_file matters; its return value is unused.
    storage.store_file(
        repository_id=repo_id,
        path=doc_path,
        relative_path=filename,
        language=language,
    )

    bm25_metadata = {"language": language} if metadata is None else metadata
    bm25_indexer.add_document(doc_path, content, bm25_metadata)

    if fuzzy_indexer is not None:
        fuzzy_indexer.add_file(doc_path, content)
    return doc_path


class TestBM25Indexer:
    """Test BM25 indexer functionality."""

    def setup_method(self):
        """Set up test environment."""
        self.test_dir = tempfile.mkdtemp()
        self.db_path = Path(self.test_dir) / "test_bm25.db"
        self.storage = SQLiteStore(str(self.db_path))
        self.bm25_indexer = BM25Indexer(self.storage)

        # Create repository
        self.repo_id = self.storage.create_repository(self.test_dir, "test_repo")

    def teardown_method(self):
        """Clean up test environment."""
        shutil.rmtree(self.test_dir)

    def _add_file(self, filename, content, language="text", metadata=None):
        """Create, store and BM25-index one file; return its path string."""
        return _index_file(
            self.storage,
            self.bm25_indexer,
            self.repo_id,
            self.test_dir,
            filename,
            content,
            language,
            metadata=metadata,
        )

    def test_initialization(self):
        """Test BM25 indexer initialization."""
        assert self.bm25_indexer is not None
        assert self.bm25_indexer.table_name == "bm25_content"

        # Check that FTS5 tables were created
        with self.storage._get_connection() as conn:
            cursor = conn.execute(
                """
                SELECT name FROM sqlite_master
                WHERE type='table' AND name LIKE 'bm25_%'
                """
            )
            tables = [row[0] for row in cursor]

        assert "bm25_content" in tables
        assert "bm25_symbols" in tables
        assert "bm25_documents" in tables

    def test_add_document(self):
        """Test adding documents to BM25 index."""
        metadata = {
            "language": "python",
            "symbols": ["hello_world", "Greeter", "greet"],
        }
        doc_path = self._add_file("test.py", _GREETER_SOURCE, "python", metadata)

        # Search for content
        results = self.bm25_indexer.search("hello world", limit=10)
        assert len(results) > 0
        assert results[0]["filepath"] == doc_path
        assert "hello" in results[0]["snippet"].lower()

    def test_search_methods(self):
        """Test different search methods."""
        docs = [
            ("doc1.txt", "The quick brown fox jumps over the lazy dog"),
            ("doc2.txt", "A quick brown fox is a fast animal"),
            ("doc3.txt", "The lazy dog sleeps all day long"),
        ]
        for filename, content in docs:
            self._add_file(filename, content)

        # Test basic search
        results = self.bm25_indexer.search("fox", limit=10)
        assert len(results) == 2
        assert all("fox" in r["snippet"] for r in results)

        # Test phrase search
        results = self.bm25_indexer.search_phrase("brown fox", limit=10)
        assert len(results) == 2

        # Test prefix search
        results = self.bm25_indexer.search_prefix("qui", limit=10)
        assert len(results) == 2

        # Test NEAR search
        results = self.bm25_indexer.search_near(["fox", "dog"], distance=10, limit=10)
        assert len(results) >= 1

    def test_term_statistics(self):
        """Test term statistics functionality."""
        docs = [
            ("doc1.txt", "Python is a programming language"),
            ("doc2.txt", "Python is popular for data science"),
            ("doc3.txt", "JavaScript is also a programming language"),
            ("doc4.txt", "Data science uses many programming languages"),
        ]
        for filename, content in docs:
            self._add_file(filename, content)

        # Get term statistics
        stats = self.bm25_indexer.get_term_statistics("python")
        assert stats["document_frequency"] == 2
        assert stats["total_documents"] == 4
        # Percentages are floats; compare with a tolerance rather than ``==``.
        assert stats["percentage"] == pytest.approx(50.0)
        assert stats["idf"] > 0

        stats = self.bm25_indexer.get_term_statistics("programming")
        assert stats["document_frequency"] == 3
        assert stats["percentage"] == pytest.approx(75.0)


class TestHybridSearch:
    """Test hybrid search functionality."""

    def setup_method(self):
        """Set up test environment."""
        self.test_dir = tempfile.mkdtemp()
        self.db_path = Path(self.test_dir) / "test_hybrid.db"
        self.storage = SQLiteStore(str(self.db_path))
        self.repo_id = self.storage.create_repository(self.test_dir, "test_repo")

        # Initialize indexers
        self.bm25_indexer = BM25Indexer(self.storage)
        self.fuzzy_indexer = FuzzyIndexer(self.storage)

        # Initialize hybrid search
        self.config = HybridSearchConfig(
            bm25_weight=0.5,
            fuzzy_weight=0.5,
            semantic_weight=0.0,
            enable_bm25=True,
            enable_fuzzy=True,
            enable_semantic=False,
        )
        self.hybrid_search = HybridSearch(
            storage=self.storage,
            bm25_indexer=self.bm25_indexer,
            fuzzy_indexer=self.fuzzy_indexer,
            semantic_indexer=None,
            config=self.config,
        )

    def teardown_method(self):
        """Clean up test environment."""
        shutil.rmtree(self.test_dir)

    def _add_file(self, filename, content, language="text", fuzzy=True):
        """Create, store and index one file; return its path string.

        The file is always added to the BM25 index; it is added to the fuzzy
        index as well unless ``fuzzy`` is False.
        """
        return _index_file(
            self.storage,
            self.bm25_indexer,
            self.repo_id,
            self.test_dir,
            filename,
            content,
            language,
            fuzzy_indexer=self.fuzzy_indexer if fuzzy else None,
        )

    @pytest.mark.asyncio
    async def test_hybrid_search_basic(self):
        """Test basic hybrid search functionality."""
        docs = [
            ("python_code.py", _STATS_SOURCE),
            ("math_utils.py", _MATH_SOURCE),
        ]
        for filename, content in docs:
            self._add_file(filename, content, language="python")

        # Test hybrid search
        results = await self.hybrid_search.search("calculate average", limit=5)

        assert len(results) > 0
        assert any("average" in r["snippet"].lower() for r in results)

        # Check that results come from multiple sources
        sources = {r["source"] for r in results}
        assert len(sources) > 1  # Should have both BM25 and fuzzy results

    @pytest.mark.asyncio
    async def test_weight_configuration(self):
        """Test weight configuration in hybrid search."""
        self._add_file("test.txt", "This is a test document for weight configuration")

        # Test with equal weights
        self.hybrid_search.set_weights(bm25=0.5, fuzzy=0.5)
        results_equal = await self.hybrid_search.search("test document", limit=1)

        # Test with BM25 bias
        self.hybrid_search.set_weights(bm25=0.9, fuzzy=0.1)
        results_biased = await self.hybrid_search.search("test document", limit=1)

        # An empty result must fail loudly; substituting a score of 0 would let
        # the inequality below pass without comparing two real scores.
        assert results_equal, "equal-weight search returned no results"
        assert results_biased, "BM25-biased search returned no results"

        # Scores should be different due to weight changes
        assert results_equal[0]["score"] != results_biased[0]["score"]

    @pytest.mark.asyncio
    async def test_method_toggling(self):
        """Test enabling/disabling search methods."""
        self._add_file("test.txt", "Enable disable test document")

        # Test with both enabled
        self.hybrid_search.enable_methods(bm25=True, fuzzy=True)
        results_both = await self.hybrid_search.search("test", limit=10)

        # Test with only BM25
        self.hybrid_search.enable_methods(bm25=True, fuzzy=False)
        results_bm25 = await self.hybrid_search.search("test", limit=10)

        # Test with only fuzzy
        self.hybrid_search.enable_methods(bm25=False, fuzzy=True)
        results_fuzzy = await self.hybrid_search.search("test", limit=10)

        # All should return results
        assert len(results_both) > 0
        assert len(results_bm25) > 0
        assert len(results_fuzzy) > 0

        # Check sources
        assert all(r["source"] == "bm25" for r in results_bm25)
        assert all(r["source"] == "fuzzy" for r in results_fuzzy)

    @pytest.mark.asyncio
    async def test_cache_functionality(self):
        """Test hybrid search caching."""
        # This document is indexed in BM25 only, not in the fuzzy index.
        self._add_file("cache_test.txt", "Cache test document content", fuzzy=False)

        # Enable caching
        self.hybrid_search.config.cache_results = True

        # First search (cache miss)
        results1 = await self.hybrid_search.search("cache test", limit=10)
        stats1 = self.hybrid_search.get_statistics()

        # Second search (cache hit)
        results2 = await self.hybrid_search.search("cache test", limit=10)
        stats2 = self.hybrid_search.get_statistics()

        # Guard the [0] lookups below so an empty result is reported as an
        # assertion failure rather than an IndexError.
        assert results1, "search returned no results"

        # Results should be identical
        assert len(results1) == len(results2)
        assert results1[0]["filepath"] == results2[0]["filepath"]

        # Cache hit rate should increase
        assert stats2["cache_hits"] > stats1.get("cache_hits", 0)

        # Clear cache
        self.hybrid_search.clear_cache()
        assert len(self.hybrid_search._result_cache) == 0


def test_bm25_optimization():
    """Test BM25 index optimization."""
    with tempfile.TemporaryDirectory() as tmpdir:
        db_path = Path(tmpdir) / "test_optimize.db"
        storage = SQLiteStore(str(db_path))
        bm25_indexer = BM25Indexer(storage)
        repo_id = storage.create_repository(tmpdir, "test_repo")

        # Add many documents
        for i in range(50):
            _index_file(
                storage,
                bm25_indexer,
                repo_id,
                tmpdir,
                f"doc{i}.txt",
                f"Document {i} with some content about topic {i % 10}",
                "text",
            )

        # Get statistics before optimization
        stats_before = bm25_indexer.get_statistics()

        # Optimize
        bm25_indexer.optimize()

        # Get statistics after optimization
        stats_after = bm25_indexer.get_statistics()

        # Document count should remain the same
        assert stats_before["total_documents"] == stats_after["total_documents"]

        # Search should still work
        results = bm25_indexer.search("topic 5", limit=10)
        assert len(results) > 0


if __name__ == "__main__":
    # Propagate pytest's exit code so a failing run exits non-zero.
    raise SystemExit(pytest.main([__file__, "-v"]))

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/ViperJuice/Code-Index-MCP'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_bm25_hybrid_search.py•13.8 KiB