Markdown RAG Documentation

Overview Schema Related Servers Score Discussions

ragdocs-mcp
tests
integration

test_hybrid_search.py•21.3 KiB

""" Integration tests for Hybrid Search (D13). Tests the SearchOrchestrator's ability to combine results from multiple search strategies (semantic, keyword, graph) with recency boosting and RRF fusion. Uses real indices and async query methods. """ from datetime import datetime, timezone from pathlib import Path import pytest from src.config import Config, IndexingConfig, LLMConfig, SearchConfig, ServerConfig, ChunkingConfig from src.indexing.manager import IndexManager from src.indices.graph import GraphStore from src.indices.keyword import KeywordIndex from src.indices.vector import VectorIndex from src.models import ChunkResult from src.search.orchestrator import SearchOrchestrator def _doc_in_chunk_ids(doc_id: str, results: list[ChunkResult]): chunk_ids = [result.chunk_id for result in results] return any(chunk_id == doc_id or chunk_id.startswith(f"{doc_id}_chunk_") for chunk_id in chunk_ids) @pytest.fixture def config(tmp_path): """ Create test configuration with temporary paths. Uses tmp_path for isolated test storage to avoid conflicts. """ docs_path = tmp_path / "docs" docs_path.mkdir() return Config( server=ServerConfig(), indexing=IndexingConfig( documents_path=str(docs_path), index_path=str(tmp_path / "indices"), ), parsers={"**/*.md": "MarkdownParser"}, search=SearchConfig( semantic_weight=1.0, keyword_weight=1.0, recency_bias=0.5, rrf_k_constant=60, ), llm=LLMConfig(embedding_model="all-MiniLM-L6-v2"), document_chunking=ChunkingConfig(), memory_chunking=ChunkingConfig(), ) @pytest.fixture def indices(shared_embedding_model): """ Create real index instances. Returns tuple of (vector, keyword, graph) indices for SearchOrchestrator. """ vector = VectorIndex(embedding_model=shared_embedding_model) keyword = KeywordIndex() graph = GraphStore() return vector, keyword, graph @pytest.fixture def manager(config, indices): """ Create IndexManager with real indices. Provides fully functional manager for indexing test documents. 
""" vector, keyword, graph = indices return IndexManager(config, vector, keyword, graph) @pytest.fixture def orchestrator(config, indices, manager): """ Create SearchOrchestrator with real indices and configuration. Provides the hybrid search engine for query testing. """ vector, keyword, graph = indices return SearchOrchestrator(vector, keyword, graph, config, manager) def create_test_corpus(config, manager): """ Create a small test corpus (3-5 docs) for hybrid search testing. Documents are designed to test different search strategies: - doc1: Contains specific keyword "authentication" - doc2: Semantically related to security concepts - doc3: Links to doc1 (tests graph traversal) - doc4: Recent document (tests recency boost) - doc5: Contains both keyword and semantic matches """ docs_path = Path(config.indexing.documents_path) now = datetime.now(timezone.utc).timestamp() # Document 1: Keyword-rich document doc1 = docs_path / "authentication.md" doc1.write_text( "# Authentication System\n\n" "The authentication module handles user login and token validation." ) manager.index_document(str(doc1)) # Document 2: Semantically related to security doc2 = docs_path / "security.md" doc2.write_text( "# Security Overview\n\n" "Security measures include access control and authorization policies." ) manager.index_document(str(doc2)) # Document 3: Links to doc1 (graph connection) doc3 = docs_path / "api.md" doc3.write_text( "# API Documentation\n\n" "See [[authentication]] for details on API authentication." ) manager.index_document(str(doc3)) # Document 4: Recent document (set modified time to 3 days ago) doc4 = docs_path / "recent.md" doc4.write_text( "# Recent Updates\n\n" "Latest changes to the authorization system and access patterns." 
) manager.index_document(str(doc4)) # Set modified time to 3 days ago for recency boost (1.2x) doc4.touch() doc4_mtime = now - (3 * 86400) # 3 days ago import os os.utime(str(doc4), (doc4_mtime, doc4_mtime)) # Document 5: Hybrid match (keyword + semantic) doc5 = docs_path / "auth_guide.md" doc5.write_text( "# Authentication Guide\n\n" "Complete guide to authentication, covering login flows and security best practices." ) manager.index_document(str(doc5)) return { "authentication": doc1, "security": doc2, "api": doc3, "recent": doc4, "auth_guide": doc5, } @pytest.mark.asyncio async def test_query_returns_results_from_multiple_strategies( config, manager, orchestrator ): """ Test that hybrid query returns results from semantic, keyword, and graph strategies. Validates that the orchestrator correctly dispatches queries to multiple search backends and aggregates their results using RRF fusion. """ # Create test corpus create_test_corpus(config, manager) # Query that should match multiple strategies results, _, _ = await orchestrator.query("authentication security", top_k=10, top_n=10) # Verify we got results assert len(results) > 0 # Verify scores are in valid range and properly ordered assert all(0.0 <= result.score <= 1.0 for result in results) scores = [r.score for r in results] assert scores == sorted(scores, reverse=True), "Results should be descending by score" # Verify results include docs from different strategies: # - "authentication" should match keyword search (exact term) # - "api" might appear via graph traversal (linked to authentication) # - "auth_guide" should match both keyword and semantic # At minimum, should include documents with explicit keyword matches assert _doc_in_chunk_ids("authentication", results) or _doc_in_chunk_ids("auth_guide", results) # Verify we got multiple results (showing fusion is working) assert len(results) >= 2 @pytest.mark.asyncio async def test_graph_neighbors_boost_related_docs(config, manager, orchestrator): """ Test that 
graph traversal boosts documents linked to query matches. When a document links to another via [[wikilinks]], the linked document should appear in results due to graph neighbor expansion. """ # Create test corpus create_test_corpus(config, manager) # Query specifically for "API Documentation" results, _, _ = await orchestrator.query("API Documentation", top_k=10, top_n=10) # Verify scores are valid and properly ordered assert all(0.0 <= result.score <= 1.0 for result in results) if len(results) > 1: scores = [r.score for r in results] assert scores == sorted(scores, reverse=True) # Extract chunk_ids for checking _ = [result.chunk_id for result in results] # "api" should match the query assert _doc_in_chunk_ids("api", results) # "authentication" should also appear because "api" links to it via [[authentication]] # The graph neighbor boost adds linked documents to the result pool # Either authentication appears directly (keyword/semantic) or via graph boost # Since api links to authentication, graph traversal should include it assert _doc_in_chunk_ids("authentication", results) or len(results) >= 2 @pytest.mark.asyncio async def test_recency_boosts_recent_docs(config, manager, orchestrator): """ Test that recently modified documents receive score boost. Documents modified within the last 7 days should receive a 1.2x multiplier, pushing them higher in the ranking than older documents with similar scores. 
""" # Create test corpus with recent doc create_test_corpus(config, manager) # Query that matches both recent and older docs results, _, _ = await orchestrator.query("authorization access", top_k=10, top_n=10) # Verify we got results assert len(results) > 0 # Verify scores are valid assert all(0.0 <= result.score <= 1.0 for result in results) if results: # Highest score should be high confidence assert results[0].score >= 0.5 # The "recent" document should appear in top results due to recency boost # It contains "authorization" and has a recent modification time (3 days ago) # This gives it a 1.2x boost vs older documents assert _doc_in_chunk_ids("recent", results) # Ideally, recent should rank highly due to recency boost # (exact ranking depends on other score factors, but it should be present) if len(results) >= 2: # If multiple results, recent should be boosted toward the top # Check if "recent" chunk exists in results before trying index chunk_ids = [result.chunk_id for result in results] recent_chunks = [r for r in chunk_ids if r.startswith("recent")] assert len(recent_chunks) > 0 @pytest.mark.asyncio async def test_empty_query_returns_empty_results(config, manager, orchestrator): """ Test that empty query returns no results. Validates graceful handling of edge case: empty or whitespace-only queries should return an empty list without errors. """ # Create test corpus create_test_corpus(config, manager) # Test empty query results_empty, _, _ = await orchestrator.query("", top_k=10, top_n=10) assert results_empty == [] # Test whitespace-only query results_whitespace, _, _ = await orchestrator.query(" \n\t ", top_k=10, top_n=10) assert results_whitespace == [] @pytest.mark.asyncio async def test_top_k_limits_results_correctly(config, manager, orchestrator): """ Test that top_k parameter correctly limits the number of results. Validates that the orchestrator respects the top_k limit when returning fused results, even when multiple strategies return many matches. 
""" # Create test corpus create_test_corpus(config, manager) # Query with very low top_k results_k1, _, _ = await orchestrator.query("authentication", top_k=1, top_n=1) assert len(results_k1) <= 1 if results_k1: assert all(0.0 <= result.score <= 1.0 for result in results_k1) # Query with top_k=2 results_k2, _, _ = await orchestrator.query("authentication", top_k=2, top_n=2) assert len(results_k2) <= 2 if results_k2: assert all(0.0 <= result.score <= 1.0 for result in results_k2) # Query with top_k=5 results_k5, _, _ = await orchestrator.query("authentication security", top_k=5, top_n=5) assert len(results_k5) <= 5 if results_k5: assert all(0.0 <= result.score <= 1.0 for result in results_k5) # Highest score should be high confidence assert results_k5[0].score >= 0.5 # Verify that increasing top_k returns more results (up to available docs) assert len(results_k1) <= len(results_k2) <= len(results_k5) @pytest.mark.asyncio async def test_weighted_strategies_affect_ranking(config, manager, orchestrator, tmp_path, shared_embedding_model): """ Test that strategy weights affect result ranking. When semantic_weight or keyword_weight is adjusted, the relative ranking of documents should change to favor results from higher-weighted strategies. 
""" # Create test corpus create_test_corpus(config, manager) # Baseline query with equal weights (1.0, 1.0) results_balanced, _, _ = await orchestrator.query("authentication", top_k=5, top_n=5) assert len(results_balanced) > 0 assert all(0.0 <= result.score <= 1.0 for result in results_balanced) # Create new config with higher semantic weight config_semantic_heavy = Config( server=ServerConfig(), indexing=IndexingConfig( documents_path=str(tmp_path / "docs2"), index_path=str(tmp_path / "indices2"), ), parsers=config.parsers, search=SearchConfig( semantic_weight=2.0, # Double semantic weight keyword_weight=1.0, recency_bias=0.5, rrf_k_constant=60, ), llm=config.llm, ) # Create new directory for second corpus docs_path2 = Path(config_semantic_heavy.indexing.documents_path) docs_path2.mkdir(parents=True, exist_ok=True) # Create fresh indices for semantic-heavy test vector_new = VectorIndex(embedding_model=shared_embedding_model) keyword_new = KeywordIndex() graph_new = GraphStore() manager_new = IndexManager(config_semantic_heavy, vector_new, keyword_new, graph_new) create_test_corpus(config_semantic_heavy, manager_new) orchestrator_semantic = SearchOrchestrator( vector_new, keyword_new, graph_new, config_semantic_heavy, manager_new ) # Query with semantic weight favored results_semantic_heavy, _, _ = await orchestrator_semantic.query("authentication", top_k=5, top_n=5) # Should still get results, but ranking may differ assert len(results_semantic_heavy) > 0 assert all(0.0 <= result.score <= 1.0 for result in results_semantic_heavy) # Verify different weight configurations produce results # (Exact ranking is complex, but we validate the system handles weights) chunk_ids = [result.chunk_id for result in results_semantic_heavy] result_set = set(chunk_ids) assert len(result_set) > 0 @pytest.mark.asyncio async def test_hybrid_search_integration_end_to_end(config, manager, orchestrator): """ Test complete hybrid search flow: index -> query -> fused results. 
End-to-end integration test validating that all components work together: multiple indices, parallel search dispatch, graph traversal, recency boost, RRF fusion, and top_k filtering. """ # Create test corpus docs = create_test_corpus(config, manager) # Verify all documents were indexed assert len(docs) == 5 # Complex query touching multiple aspects results, _, _ = await orchestrator.query( "authentication security access control", top_k=10, top_n=10 ) # Should return multiple results from different strategies assert len(results) >= 2 # Verify result types are correct assert all(isinstance(item, ChunkResult) for item in results) assert all(isinstance(result.chunk_id, str) and isinstance(result.score, float) for result in results) # Verify scores are valid and properly ordered assert all(0.0 <= result.score <= 1.0 for result in results) if len(results) > 1: scores = [r.score for r in results] assert scores == sorted(scores, reverse=True), "Results should be descending by score" # Extract chunk_ids for duplicate check chunk_ids = [result.chunk_id for result in results] # Verify no duplicate results assert len(chunk_ids) == len(set(chunk_ids)) # Verify results include expected documents - authentication should match assert _doc_in_chunk_ids("authentication", results) or _doc_in_chunk_ids("auth_guide", results) @pytest.mark.asyncio async def test_query_returns_normalized_scores(config, manager, orchestrator): """ Test that SearchOrchestrator returns properly normalized scores. 
Validates that the score normalization pipeline works correctly: - Scores are in [0, 1] range - Highest score is 1.0 - Scores are descending - Results are tuples of (chunk_id, score) """ create_test_corpus(config, manager) results, _, _ = await orchestrator.query("authentication", top_k=10, top_n=5) # Verify structure assert len(results) <= 5 assert all(isinstance(item, ChunkResult) for item in results) # Verify score properties if results: chunk_ids = [result.chunk_id for result in results] scores = [result.score for result in results] # All scores in [0, 1] assert all(0.0 <= score <= 1.0 for score in scores) # Highest score should be high confidence assert results[0].score >= 0.5 # Scores are descending for i in range(len(results) - 1): assert results[i].score >= results[i+1].score # All chunk_ids are strings assert all(isinstance(chunk_id, str) for chunk_id in chunk_ids) # All scores are floats assert all(isinstance(score, float) for score in scores) @pytest.mark.asyncio async def test_top_n_parameter_limits_results(config, manager, orchestrator): """ Test that top_n parameter correctly limits result count. 
Validates: - top_n=5 returns at most 5 results - top_n=3 returns at most 3 results - top_n=1 returns at most 1 result - Top N results are consistent across queries """ create_test_corpus(config, manager) results_5, _, _ = await orchestrator.query("authentication", top_k=10, top_n=5) results_3, _, _ = await orchestrator.query("authentication", top_k=10, top_n=3) results_1, _, _ = await orchestrator.query("authentication", top_k=10, top_n=1) # Verify limits are respected assert len(results_5) <= 5 assert len(results_3) <= 3 assert len(results_1) <= 1 # Verify scores are valid for all result sets for results in [results_5, results_3, results_1]: assert all(0.0 <= result.score <= 1.0 for result in results) if results: # Highest score should be high confidence assert results[0].score >= 0.5 # Top N results should match: results_5[:3] should equal results_3 if len(results_3) >= 3 and len(results_5) >= 3: assert results_5[:3] == results_3[:3] # Top 1 result should match across all queries if len(results_1) >= 1 and len(results_3) >= 1 and len(results_5) >= 1: assert results_1[0] == results_3[0] == results_5[0] @pytest.mark.asyncio async def test_normalized_scores_range_0_to_1(config, manager, orchestrator): """ Test that all normalized scores are strictly within [0, 1] range. Boundary validation test: ensures no score escapes normalization bounds regardless of query complexity or strategy weights. 
""" create_test_corpus(config, manager) # Test with various queries queries = [ "authentication security", "API documentation", "access control", "recent updates", "authorization", ] for query in queries: results, _, _ = await orchestrator.query(query, top_k=10, top_n=10) for result in results: # Verify types assert isinstance(result.chunk_id, str), f"chunk_id must be str, got {type(result.chunk_id)}" assert isinstance(result.score, float), f"score must be float, got {type(result.score)}" # Verify score bounds assert 0.0 <= result.score <= 1.0, f"Score {result.score} out of range [0, 1] for {result.chunk_id}" # If results exist, highest score should be high confidence if results: highest_score = results[0].score assert highest_score >= 0.3, f"Highest score should be >= 0.3, got {highest_score}" # If multiple results, lowest score must be >= 0.0 if len(results) > 1: lowest_score = results[-1].score assert lowest_score >= 0.0, f"Lowest score should be >= 0.0, got {lowest_score}" @pytest.mark.asyncio async def test_top_n_greater_than_10_works_correctly(config, manager, orchestrator): """ Test that top_n > 10 works correctly with dynamic top_k calculation. This test validates the fix for the issue where top_k was hardcoded to 10, which limited results even when top_n was set higher. With the dynamic calculation (top_k = max(10, top_n * 2)), requesting top_n=25 should retrieve sufficient candidates to return close to 25 results. 
Validates: - top_n=25 can return up to 25 results (corpus size permitting) - Dynamic top_k calculation provides sufficient candidates - Results maintain proper score normalization """ create_test_corpus(config, manager) # Request 25 results with dynamic top_k calculation results_25, _, _ = await orchestrator.query("authentication security API", top_k=50, top_n=25) # Should return results (exact count depends on corpus size) # Since our test corpus has 5 documents with multiple chunks each, # we should get multiple results, though likely < 25 due to corpus size assert len(results_25) > 0, "Should return at least some results" # Verify all results have valid scores assert all(0.0 <= result.score <= 1.0 for result in results_25) # Verify highest score is high confidence if results_25: assert results_25[0].score >= 0.5 # Verify results are properly sorted descending for i in range(len(results_25) - 1): assert results_25[i].score >= results_25[i+1].score # Verify that top_n=25 can return more results than top_n=10 # (if corpus is large enough) results_10, _, _ = await orchestrator.query("authentication security API", top_k=20, top_n=10) # The key validation: with sufficient corpus, top_n=25 should not be # artificially limited to 10 results by a hardcoded top_k # At minimum, both queries should work without error and return valid results assert len(results_10) <= 10 assert len(results_25) <= 25 # If we got 10 results for the first query, and our corpus is larger, # we should be able to get more for top_n=25 # (Though in small test corpus, this might not always be true)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/andnp/ragdocs-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_hybrid_search.py•21.3 KiB