Markdown RAG Documentation

Overview Schema Related Servers Score Discussions

ragdocs-mcp
tests
integration

test_calibration_scenarios.py•9.82 KiB

"""
Integration tests for realistic calibration scenarios.

Tests that sigmoid calibration provides meaningful confidence scores
for different query quality levels and validates min_score filtering.
"""

from pathlib import Path

import pytest

from src.config import (
    ChunkingConfig,
    Config,
    IndexingConfig,
    LLMConfig,
    SearchConfig,
    ServerConfig,
)
from src.indexing.manager import IndexManager
from src.indices.graph import GraphStore
from src.indices.keyword import KeywordIndex
from src.indices.vector import VectorIndex
from src.search.orchestrator import SearchOrchestrator


# (file stem, markdown body) pairs written and indexed by create_test_corpus,
# in this order. The stem doubles as the key of the mapping it returns.
_CORPUS_DOCUMENTS = (
    # High-quality match target
    (
        "api_reference",
        "# API Reference\n\n"
        "Complete authentication API documentation including login, "
        "token validation, and session management endpoints.",
    ),
    # Good match target
    (
        "auth_guide",
        "# Authentication Guide\n\n"
        "User authentication involves verifying credentials and "
        "establishing secure sessions. See security best practices.",
    ),
    # Moderate match target
    (
        "security_overview",
        "# Security Overview\n\n"
        "Security features include access control, encryption, "
        "and monitoring. All systems require proper authentication.",
    ),
    # Weak match target
    (
        "deployment",
        "# Deployment Guide\n\n"
        "Deployment procedures for production environments. "
        "Configure servers, databases, and networking.",
    ),
    # Unrelated content
    (
        "recipes",
        "# Cooking Recipes\n\n"
        "Collection of favorite recipes including pasta, "
        "bread, and desserts.",
    ),
)


@pytest.fixture
def config(tmp_path):
    """Create test configuration with temporary paths."""
    docs_path = tmp_path / "docs"
    docs_path.mkdir()

    return Config(
        server=ServerConfig(),
        indexing=IndexingConfig(
            documents_path=str(docs_path),
            index_path=str(tmp_path / "indices"),
        ),
        parsers={"**/*.md": "MarkdownParser"},
        search=SearchConfig(
            semantic_weight=1.0,
            keyword_weight=1.0,
            recency_bias=0.5,
            rrf_k_constant=60,
        ),
        llm=LLMConfig(embedding_model="all-MiniLM-L6-v2"),
        document_chunking=ChunkingConfig(),
        memory_chunking=ChunkingConfig(),
    )


@pytest.fixture
def indices():
    """Create real index instances as a (vector, keyword, graph) tuple."""
    vector = VectorIndex()
    keyword = KeywordIndex()
    graph = GraphStore()
    return vector, keyword, graph


@pytest.fixture
def manager(config, indices):
    """Create IndexManager with real indices."""
    vector, keyword, graph = indices
    return IndexManager(config, vector, keyword, graph)


@pytest.fixture
def orchestrator(config, indices, manager):
    """Create SearchOrchestrator with real indices and configuration."""
    vector, keyword, graph = indices
    return SearchOrchestrator(vector, keyword, graph, config, manager)


def create_test_corpus(config, manager):
    """
    Create a test corpus for calibration testing.

    Documents designed to test different match qualities:
    - Technical API docs (precise terminology)
    - General guides (broader concepts)
    - Unrelated content (low relevance)

    Each document is written under ``config.indexing.documents_path`` as
    ``<stem>.md`` and then passed to ``manager.index_document``.

    Returns:
        dict mapping each document stem (e.g. ``"api_reference"``) to the
        ``Path`` of the file that was written.
    """
    docs_path = Path(config.indexing.documents_path)

    corpus = {}
    for stem, body in _CORPUS_DOCUMENTS:
        doc_path = docs_path / f"{stem}.md"
        # Explicit encoding so the bytes on disk do not depend on the
        # platform's default locale.
        doc_path.write_text(body, encoding="utf-8")
        manager.index_document(str(doc_path))
        corpus[stem] = doc_path

    return corpus


def _assert_scores_calibrated(results):
    """Assert every result's score lies in the closed interval [0.0, 1.0]."""
    for result in results:
        assert 0.0 <= result.score <= 1.0, (
            f"Score out of range [0, 1]: {result.score}"
        )


@pytest.mark.asyncio
async def test_good_match_returns_high_confidence(config, manager, orchestrator):
    """
    Test that a good query match returns high confidence scores (>0.7).

    Validates that precise queries for relevant content produce
    scores indicating strong matches.
    """
    create_test_corpus(config, manager)

    # Precise query matching doc1 (api_reference.md)
    results, _, _ = await orchestrator.query(
        "authentication API reference documentation", top_k=10, top_n=10
    )

    assert len(results) > 0

    # Top result should have high confidence (>0.7)
    top_score = results[0].score
    assert top_score > 0.7, f"Good match should have confidence >0.7, got {top_score}"

    # Best result should ideally be very high confidence (>0.95).
    # Only enforced when the top chunk actually comes from api_reference.
    if results[0].chunk_id.startswith("api_reference"):
        assert top_score > 0.95, "Excellent match should have confidence >0.95"


@pytest.mark.asyncio
async def test_weak_query_returns_varied_confidence(config, manager, orchestrator):
    """
    Test that vague queries return varied confidence scores.

    With calibration, even weak semantic matches can have high confidence
    if RRF fusion identifies them as top results. The key is that calibration
    reflects absolute match quality, not relative ranking within weak results.
    """
    create_test_corpus(config, manager)

    # Vague query with limited semantic signal
    results, _, _ = await orchestrator.query(
        "general information overview", top_k=10, top_n=10
    )

    # Results should span a range of confidence levels; some may be high
    # (if content matches weakly but consistently). An empty result list is
    # tolerated, so only the score range is validated.
    _assert_scores_calibrated(results)


@pytest.mark.asyncio
async def test_nonsense_query_still_returns_results(config, manager, orchestrator):
    """
    Test that nonsense queries return results with calibrated scores.

    Important finding: Even with nonsense queries, if the system finds
    *some* documents via semantic/keyword matching (even weak matches),
    calibration will score them based on RRF fusion strength, not
    semantic relevance alone.

    This is expected behavior - calibration measures "how well does this
    match the fusion algorithm's ranking" not "is this semantically related."
    """
    create_test_corpus(config, manager)

    # Query unrelated to corpus
    results, _, _ = await orchestrator.query(
        "quantum blockchain cryptocurrency mining", top_k=10, top_n=10
    )

    # The system may return *something* even for nonsense; calibration
    # scores reflect fusion confidence, not semantic accuracy. Whatever is
    # returned must carry a valid score.
    _assert_scores_calibrated(results)


@pytest.mark.asyncio
async def test_score_filtering_in_pipeline(config, manager, orchestrator):
    """
    Test that result limiting works with top_n parameter.

    The orchestrator query method supports top_k and top_n for limiting results.
    """
    create_test_corpus(config, manager)

    # Query without filtering
    results_all, _, _ = await orchestrator.query(
        "security authentication", top_k=10, top_n=10
    )

    # Get fewer results with a lower top_n limit
    # (as a proxy for confidence filtering)
    results_fewer, _, _ = await orchestrator.query(
        "security authentication", top_k=10, top_n=3
    )

    # Verify top_n limiting behavior
    assert len(results_fewer) <= 3, "top_n should limit result count"
    assert len(results_fewer) <= len(results_all), \
        "Fewer results with lower top_n"

    # Verify all scores are calibrated
    for result_set in (results_all, results_fewer):
        _assert_scores_calibrated(result_set)


@pytest.mark.asyncio
async def test_score_consistency_across_queries(config, manager, orchestrator):
    """
    Test that similar queries produce consistent confidence scores.

    Validates that calibration provides stable, reproducible scores
    for equivalent queries.
    """
    create_test_corpus(config, manager)

    # Run same query twice
    results1, _, _ = await orchestrator.query(
        "API authentication documentation", top_k=10, top_n=5
    )
    results2, _, _ = await orchestrator.query(
        "API authentication documentation", top_k=10, top_n=5
    )

    # Results should be identical (deterministic)
    assert len(results1) == len(results2)

    for r1, r2 in zip(results1, results2):
        assert r1.chunk_id == r2.chunk_id
        assert abs(r1.score - r2.score) < 0.001, \
            f"Scores should be consistent: {r1.score} vs {r2.score}"


@pytest.mark.asyncio
async def test_confidence_levels_interpretation(config, manager, orchestrator):
    """
    Test that scores map to interpretable confidence levels.

    Validates the semantic meaning of different score ranges:
    - >0.9: Excellent match
    - 0.7-0.9: Good match
    - 0.5-0.7: Moderate match
    - 0.3-0.5: Weak match
    - <0.3: Poor match
    """
    create_test_corpus(config, manager)

    # High-precision query for excellent match
    results_excellent, _, _ = await orchestrator.query(
        "API reference authentication", top_k=5, top_n=5
    )

    # Moderate query for good match
    results_good, _, _ = await orchestrator.query(
        "authentication security", top_k=5, top_n=5
    )

    # Validate score ranges. Each check is skipped when its query returned
    # nothing, so an empty result set does not fail this test.
    if results_excellent:
        top_excellent = results_excellent[0].score
        # Precise query should yield excellent or good confidence
        assert top_excellent > 0.7, \
            f"Precise query should have >0.7 confidence, got {top_excellent}"

    if results_good:
        top_good = results_good[0].score
        # Moderate query should yield good or moderate confidence
        assert top_good > 0.5, \
            f"Moderate query should have >0.5 confidence, got {top_good}"

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/andnp/ragdocs-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_calibration_scenarios.py•9.82 KiB