Markdown RAG Documentation

Overview Schema Related Servers Score Discussions

ragdocs-mcp
tests
integration

test_orchestrator_chunk_result.py•11.5 KiB

import pytest from src.config import ChunkingConfig, Config, IndexingConfig, LLMConfig, SearchConfig, ServerConfig from src.indexing.manager import IndexManager from src.indices.graph import GraphStore from src.indices.keyword import KeywordIndex from src.indices.vector import VectorIndex from src.models import ChunkResult from src.search.orchestrator import SearchOrchestrator from tests.conftest import create_test_document @pytest.fixture def config(tmp_path): return Config( server=ServerConfig(host="127.0.0.1", port=8000), indexing=IndexingConfig( documents_path=str(tmp_path / "docs"), index_path=str(tmp_path / ".index_data"), recursive=False, ), parsers={"**/*.md": "MarkdownParser"}, search=SearchConfig( semantic_weight=1.0, keyword_weight=1.0, recency_bias=0.5, rrf_k_constant=60, ), llm=LLMConfig(embedding_model="local"), document_chunking=ChunkingConfig( strategy="header_based", min_chunk_chars=200, max_chunk_chars=1500, overlap_chars=100, include_parent_headers=True, ), memory_chunking=ChunkingConfig( strategy="header_based", min_chunk_chars=200, max_chunk_chars=1500, overlap_chars=100, include_parent_headers=True, ), ) @pytest.fixture def indices(shared_embedding_model): return { "vector": VectorIndex(embedding_model=shared_embedding_model), "keyword": KeywordIndex(), "graph": GraphStore(), } @pytest.fixture def manager(config, indices): return IndexManager( config, indices["vector"], indices["keyword"], indices["graph"], ) @pytest.fixture def orchestrator(config, indices, manager): return SearchOrchestrator( indices["vector"], indices["keyword"], indices["graph"], config, manager, ) @pytest.mark.asyncio async def test_query_returns_chunk_result_objects(config, manager, orchestrator): """ Test that SearchOrchestrator.query() returns list of ChunkResult objects. Verifies: - Return type is list[ChunkResult] - All items in list are ChunkResult instances - Each result has required fields populated """ from pathlib import Path # Create docs directory docs_dir = Path(config.indexing.documents_path) docs_dir.mkdir(parents=True, exist_ok=True) # Create and index test documents doc1 = create_test_document( docs_dir, "authentication", "# Authentication\n\nHow to configure OAuth 2.0 authentication.", ) doc2 = create_test_document( docs_dir, "security", "# Security\n\nSecurity best practices and configuration.", ) manager.index_document(doc1) manager.index_document(doc2) # Execute query results, _, _ = await orchestrator.query("authentication setup", top_k=10, top_n=5) # Verify return type assert isinstance(results, list) assert all(isinstance(result, ChunkResult) for result in results) # Verify ChunkResult fields are populated if results: result = results[0] assert isinstance(result.chunk_id, str) assert isinstance(result.doc_id, str) assert isinstance(result.score, float) assert isinstance(result.header_path, str) assert isinstance(result.file_path, str) # Verify chunk_id format assert "_chunk_" in result.chunk_id or result.chunk_id @pytest.mark.asyncio async def test_chunk_result_contains_metadata(config, manager, orchestrator): """ Test that ChunkResult objects contain proper metadata from chunks. Verifies: - header_path is populated from chunk metadata - file_path is populated from chunk metadata - doc_id matches chunk_id prefix """ from pathlib import Path # Create docs directory docs_dir = Path(config.indexing.documents_path) docs_dir.mkdir(parents=True, exist_ok=True) # Create document with explicit headers doc = create_test_document( docs_dir, "api_guide", "# API Guide\n\n## Authentication\n\nUse API keys for authentication.\n\n## Authorization\n\nRoles and permissions.", ) manager.index_document(doc) # Query for content that should match results, _, _ = await orchestrator.query("authentication API", top_k=10, top_n=3) assert len(results) > 0 # Check that at least one result has metadata found_metadata = False for result in results: # Verify doc_id is set assert result.doc_id != "" # Verify file_path is populated if available if result.file_path: assert result.file_path.endswith(".md") found_metadata = True # Verify chunk_id contains doc_id if result.doc_id: assert result.doc_id in result.chunk_id or result.chunk_id.startswith(result.doc_id) # At least one result should have metadata assert found_metadata @pytest.mark.asyncio async def test_chunk_result_scores_normalized(config, manager, orchestrator): """ Test that ChunkResult scores are properly normalized. Verifies: - First result has score 1.0 (highest score normalized to 1.0) - Scores are in descending order - All scores are in [0.0, 1.0] range """ from pathlib import Path # Create docs directory docs_dir = Path(config.indexing.documents_path) docs_dir.mkdir(parents=True, exist_ok=True) # Create multiple test documents doc1 = create_test_document(docs_dir, "doc1", "# Document One\n\nAuthentication and security.") doc2 = create_test_document(docs_dir, "doc2", "# Document Two\n\nSecurity practices.") doc3 = create_test_document(docs_dir, "doc3", "# Document Three\n\nAuthentication setup guide.") doc4 = create_test_document(docs_dir, "doc4", "# Document Four\n\nAPI documentation.") for doc_path in [doc1, doc2, doc3, doc4]: manager.index_document(doc_path) # Query that should match multiple documents results, _, _ = await orchestrator.query("authentication security", top_k=10, top_n=10) assert len(results) >= 2, "Should have at least 2 results" # Extract scores scores = [result.score for result in results] # Verify first result has high confidence (calibrated score) assert results[0].score > 0.95, "Highest score should be > 0.95 (high confidence)" # Verify scores are descending for i in range(len(scores) - 1): assert scores[i] >= scores[i+1], f"Scores should be descending: {scores}" # Verify all scores in [0.0, 1.0] for score in scores: assert 0.0 <= score <= 1.0, f"Score {score} out of range" @pytest.mark.asyncio async def test_query_with_missing_chunk_fallback(config, indices, manager, orchestrator): """ Test fallback behavior when chunk metadata is missing. Verifies: - ChunkResult is created even if get_chunk_by_id returns None - header_path and file_path - doc_id is derived from chunk_id """ from pathlib import Path # Create docs directory docs_dir = Path(config.indexing.documents_path) docs_dir.mkdir(parents=True, exist_ok=True) # Create and index a document doc = create_test_document( docs_dir, "fallback_test", "# Fallback Test\n\nTest document for fallback behavior.", ) manager.index_document(doc) # Get initial results to verify normal operation results, _, _ = await orchestrator.query("fallback test", top_k=10, top_n=5) assert len(results) > 0, "Should have at least one result" # Now simulate missing chunk data by querying after clearing chunk data # (In real scenario, this could happen if vector index is corrupted) # We can't easily mock this without changing the orchestrator, so we verify # the fallback logic exists by checking the code path # Verify results have expected structure even in worst case for result in results: assert result.chunk_id != "" assert isinstance(result.score, float) assert 0.0 <= result.score <= 1.0 # If metadata is missing, fields should be empty strings (not None) assert isinstance(result.header_path, str) assert isinstance(result.file_path, str) # doc_id should be derivable from chunk_id even if chunk data is missing if result.doc_id: # Normal case: doc_id is set from chunk data assert isinstance(result.doc_id, str) # If doc_id is empty, fallback should have populated it from chunk_id parsing @pytest.mark.asyncio async def test_chunk_result_serialization_in_pipeline(config, manager, orchestrator): """ Verifies: - ChunkResult can be created from query results - to_dict() produces JSON-serializable output - Serialized format matches API response expectations """ from pathlib import Path # Create docs directory docs_dir = Path(config.indexing.documents_path) docs_dir.mkdir(parents=True, exist_ok=True) # Create test document doc = create_test_document( docs_dir, "serialization_test", "# Serialization Test\n\nTest document content for serialization.", ) manager.index_document(doc) # Execute query results, _, _ = await orchestrator.query("serialization test", top_k=5, top_n=3) assert len(results) > 0 # Serialize results results_dict = [result.to_dict() for result in results] # Verify serialization format assert isinstance(results_dict, list) assert all(isinstance(item, dict) for item in results_dict) # Verify each dict has required keys for item in results_dict: assert "chunk_id" in item assert "doc_id" in item assert "score" in item assert "header_path" in item assert "file_path" in item # Verify types in serialized form assert isinstance(item["chunk_id"], str) assert isinstance(item["doc_id"], str) assert isinstance(item["score"], float) assert isinstance(item["header_path"], str) assert isinstance(item["file_path"], str) @pytest.mark.asyncio async def test_chunk_result_with_complex_headers(config, manager, orchestrator): """ Verifies: - Nested headers are captured in header_path - Header path format is consistent - Multiple levels of nesting work correctly """ from pathlib import Path # Create docs directory docs_dir = Path(config.indexing.documents_path) docs_dir.mkdir(parents=True, exist_ok=True) # Create document with nested headers doc = create_test_document( docs_dir, "complex_doc", """# Main Title ## Section One ### Subsection A Content in subsection A with important information. ### Subsection B More content here. ## Section Two ### Subsection C Additional details. """, ) manager.index_document(doc) # Query for content that should match results, _, _ = await orchestrator.query("subsection important information", top_k=10, top_n=5) assert len(results) > 0 # At least one result should have a header_path with multiple levels for result in results: if " > " in result.header_path or ">" in result.header_path: # Verify header path structure assert isinstance(result.header_path, str) assert len(result.header_path) > 0 break # Complex documents should have nested headers # (This may not always be true depending on chunking strategy)

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/andnp/ragdocs-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

test_orchestrator_chunk_result.py•11.5 KiB