"""Integration tests for semantic indexer with document processing."""
import hashlib
import logging
import os
import time
from unittest.mock import MagicMock, patch
import numpy as np
import pytest
from mcp_server.document_processing.document_interfaces import (
ChunkMetadata,
ChunkType,
DocumentChunk,
)
from mcp_server.plugins.markdown_plugin.plugin import MarkdownPlugin
from mcp_server.utils.semantic_indexer import SemanticIndexer
from tests.base_test import BaseDocumentTest
logger = logging.getLogger(__name__)
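
def _fake_embedding(text: str) -> np.ndarray:
    """Rebuild the deterministic hash-based embedding used by the Voyage mock.

    Mirrors mock_embed in the mock_voyage_client fixture so tests can
    reconstruct the vector the mock returned for a given input text.
    """
    hash_val = int(hashlib.md5(text.encode()).hexdigest()[:8], 16)
    return np.array([(hash_val >> i & 1) * 0.1 for i in range(1024)])
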
class TestSemanticDocumentIntegration(BaseDocumentTest):
"""Test semantic indexer integration with document processing system."""
@pytest.fixture
def mock_qdrant_client(self):
"""Create a mock Qdrant client."""
with patch("mcp_server.utils.semantic_indexer.QdrantClient") as mock:
client = MagicMock()
mock.return_value = client
# Mock collection operations
client.get_collections.return_value.collections = []
client.create_collection.return_value = True
client.upsert.return_value = True
client.search.return_value = []
yield client
@pytest.fixture
def mock_voyage_client(self):
"""Create a mock Voyage AI client."""
with patch("mcp_server.utils.semantic_indexer.voyageai.Client") as mock:
client = MagicMock()
mock.return_value = client
# Mock embedding generation
def mock_embed(texts, model, input_type=None):
# Generate deterministic fake embeddings
embeddings = []
for text in texts:
# Create a simple hash-based embedding
hash_val = int(hashlib.md5(text.encode()).hexdigest()[:8], 16)
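                    # Only the low 32 bits of the digest vary, so only the
                    # first 32 dimensions carry signal; enough for these tests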
embedding = [(hash_val >> i & 1) * 0.1 for i in range(1024)]
embeddings.append(embedding)
return MagicMock(embeddings=embeddings)
client.embed.side_effect = mock_embed
yield client
@pytest.fixture
def semantic_indexer(self, mock_qdrant_client, mock_voyage_client):
"""Create semantic indexer with mocked clients."""
with patch.dict(os.environ, {"VOYAGE_API_KEY": "test-key"}):
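            # The key only needs to exist during construction; the Voyage
            # client itself is mocked, so no real requests are made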
indexer = SemanticIndexer(
collection_name="test_collection", qdrant_host="localhost", qdrant_port=6333
)
return indexer
def test_semantic_indexer_processes_document_chunks(self, semantic_indexer):
"""Test that semantic indexer correctly processes document chunks."""
# Create document chunks
chunks = [
DocumentChunk(
id="chunk1",
content="# Installation Guide\n\nInstall using pip: pip install mypackage",
type=ChunkType.HEADING,
metadata=ChunkMetadata(
document_path=str(self.workspace / "README.md"),
section_hierarchy=["Installation Guide"],
chunk_index=0,
total_chunks=3,
has_code=True,
word_count=10,
),
),
DocumentChunk(
id="chunk2",
content="## Requirements\n\n- Python 3.8+\n- pip package manager",
type=ChunkType.HEADING,
metadata=ChunkMetadata(
document_path=str(self.workspace / "README.md"),
section_hierarchy=["Installation Guide", "Requirements"],
chunk_index=1,
total_chunks=3,
has_code=False,
word_count=8,
),
),
DocumentChunk(
id="chunk3",
content="```python\nfrom mypackage import Client\nclient = Client()\n```",
type=ChunkType.CODE_BLOCK,
metadata=ChunkMetadata(
document_path=str(self.workspace / "README.md"),
section_hierarchy=["Installation Guide", "Usage"],
chunk_index=2,
total_chunks=3,
has_code=True,
language="python",
word_count=6,
),
),
]
# Index chunks
for chunk in chunks:
semantic_indexer.index_document_chunk(
chunk_id=chunk.id,
content=chunk.content,
file_path=chunk.metadata.document_path,
chunk_metadata={
"type": chunk.type.value,
"section_hierarchy": chunk.metadata.section_hierarchy,
"has_code": chunk.metadata.has_code,
"language": chunk.metadata.language,
},
)
# Verify indexing calls
assert semantic_indexer._qdrant_client.upsert.called
call_args = semantic_indexer._qdrant_client.upsert.call_args_list
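        # At least one upsert per chunk; the indexer may batch or add
        # bookkeeping points, so allow extra calls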
assert len(call_args) >= len(chunks)
def test_semantic_search_with_document_context(self, semantic_indexer, mock_qdrant_client):
"""Test semantic search with document context awareness."""
# Setup mock search results
mock_results = [
MagicMock(
id="chunk1",
score=0.95,
payload={
"content": "Install using pip: pip install mypackage",
"file_path": "/docs/README.md",
"type": "document",
"metadata": {
"type": "heading",
"section_hierarchy": ["Installation Guide"],
"has_code": True,
},
},
),
MagicMock(
id="chunk2",
score=0.85,
payload={
"content": "pip is the package installer for Python",
"file_path": "/docs/glossary.md",
"type": "document",
"metadata": {
"type": "paragraph",
"section_hierarchy": ["Glossary", "Package Management"],
},
},
),
]
mock_qdrant_client.search.return_value = mock_results
# Perform search
results = semantic_indexer.search_documents(
query="how to install with pip", limit=10, file_pattern="*.md"
)
# Verify search was called with document type
mock_qdrant_client.search.assert_called()
# Verify results are properly formatted
assert len(results) == 2
assert results[0]["score"] > results[1]["score"] # Higher score first
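        # Section hierarchy metadata should survive into the result payload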
assert "Installation Guide" in str(results[0])
def test_semantic_indexer_document_type_weighting(self, semantic_indexer):
"""Test that document types receive appropriate weight boosts."""
# Test weight calculation for different document types
readme_weight = semantic_indexer._get_document_weight("README.md", "markdown")
assert readme_weight > 1.0 # README should get boost
        md_weight = semantic_indexer._get_document_weight("docs/api.md", "markdown")
        assert md_weight >= 1.0  # Markdown docs get at least standard weight
txt_weight = semantic_indexer._get_document_weight("notes.txt", "plaintext")
assert txt_weight == 1.0 # Plain text gets standard weight
def test_integration_with_markdown_plugin(self, semantic_indexer):
"""Test semantic indexer integration with markdown plugin."""
# Create a markdown file
md_file = self.workspace / "guide.md"
        content = """# User Guide

## Getting Started

Follow these steps to get started:

1. Install the package
2. Configure settings
3. Run the application

## Advanced Usage

### Custom Configuration

You can customize the behavior by modifying the config file:

```yaml
server:
  host: localhost
  port: 8080
options:
  debug: true
```

### API Integration

Use our REST API for programmatic access.
"""
md_file.write_text(content)
# Create plugin with semantic indexer
with patch.dict(os.environ, {"SEMANTIC_SEARCH_ENABLED": "true"}):
plugin = MarkdownPlugin(sqlite_store=self.store, enable_semantic=True)
# Inject our mocked indexer
plugin._semantic_indexer = semantic_indexer
# Index the file
shard = plugin.index([str(md_file)])
assert shard is not None
# Verify semantic indexing was called
assert semantic_indexer._voyage_client.embed.called
def test_semantic_search_respects_file_patterns(self, semantic_indexer, mock_qdrant_client):
"""Test that semantic search correctly filters by file patterns."""
# Setup mock results from different file types
mock_results = [
MagicMock(
id="1",
score=0.9,
payload={
"content": "Configuration guide",
"file_path": "/docs/config.md",
"type": "document",
},
),
MagicMock(
id="2",
score=0.85,
payload={
"content": "Configuration example",
"file_path": "/examples/config.py",
"type": "code",
},
),
MagicMock(
id="3",
score=0.8,
payload={
"content": "Config notes",
"file_path": "/notes/config.txt",
"type": "document",
},
),
]
mock_qdrant_client.search.return_value = mock_results
# Search with file pattern filter
md_results = semantic_indexer.search_documents(
query="configuration", limit=10, file_pattern="*.md"
)
        # Should only return .md files, and the filter should not pass vacuously
        assert md_results, "expected at least one .md result"
        assert all(r.get("file_path", "").endswith(".md") for r in md_results)
def test_document_section_extraction(self, semantic_indexer):
"""Test extraction of document sections for hierarchical indexing."""
        markdown_content = """# Main Title

## Section 1

Content for section 1.

### Subsection 1.1

Details about subsection.

## Section 2

Another main section.

### Subsection 2.1

More details.

#### Subsubsection 2.1.1

Deep nesting example.
"""
# Extract sections
sections = semantic_indexer.extract_markdown_sections(markdown_content)
# Verify section hierarchy
assert len(sections) > 0
# Find main sections
main_sections = [s for s in sections if s.level == 2]
assert len(main_sections) == 2
assert main_sections[0].title == "Section 1"
assert main_sections[1].title == "Section 2"
# Check subsections
subsections = [s for s in sections if s.level == 3]
assert len(subsections) >= 2
# Verify deep nesting
deep_sections = [s for s in sections if s.level == 4]
assert len(deep_sections) >= 1
assert deep_sections[0].title == "Subsubsection 2.1.1"
def test_semantic_indexer_handles_large_documents(self, semantic_indexer):
"""Test semantic indexer with large documents that need chunking."""
# Create a large document
sections = []
for i in range(20):
sections.append(
f"""## Section {i}
This is the content for section {i}. It contains multiple paragraphs
to simulate a real document with substantial content.
Here's another paragraph with more details about topic {i}.
It includes various technical terms and explanations.
"""
)
large_content = "# Large Document\n\n" + "\n".join(sections)
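        # Roughly 5KB of content across 20 sections, which should be enough
        # to force the indexer to split the document into multiple chunks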
# Index the large document
semantic_indexer.index_document_content(
content=large_content, file_path=str(self.workspace / "large.md"), doc_type="markdown"
)
# Verify chunking happened
assert semantic_indexer._voyage_client.embed.called
# Check that multiple chunks were created
embed_calls = semantic_indexer._voyage_client.embed.call_count
assert embed_calls > 1 # Should have multiple embedding calls for chunks
def test_semantic_cache_integration(self, semantic_indexer):
"""Test that semantic indexer integrates with caching layer."""
# Create a document
doc_path = str(self.workspace / "cached.md")
content = "# Cached Document\n\nThis content should be cached."
# Index document twice
semantic_indexer.index_document_content(content, doc_path, "markdown")
semantic_indexer.index_document_content(content, doc_path, "markdown")
# Second indexing should use cache (fewer embed calls)
embed_calls = semantic_indexer._voyage_client.embed.call_count
# With caching, we shouldn't have double the calls
assert embed_calls < 4 # Would be 4+ without caching
def test_semantic_embeddings_quality(self, semantic_indexer, mock_voyage_client):
"""Test quality and consistency of semantic embeddings."""
# Create similar documents
similar_docs = [
("doc1.md", "# Machine Learning\n\nNeural networks and deep learning concepts."),
("doc2.md", "# Deep Learning\n\nNeural network architectures and training."),
("doc3.md", "# AI Models\n\nMachine learning and neural network models."),
]
# Create dissimilar document
dissimilar_doc = ("cooking.md", "# Cooking Recipe\n\nHow to make chocolate cake.")
# Generate embeddings for all
embeddings = {}
for filename, content in similar_docs + [dissimilar_doc]:
file_path = str(self.workspace / filename)
semantic_indexer.index_document_content(content, file_path, "markdown")
            # call_args records the input texts, not the returned vectors, so
            # rebuild the deterministic embedding for the first text embedded
            call_args = mock_voyage_client.embed.call_args_list[-1]
            texts = call_args[0][0]
            first_text = texts[0] if isinstance(texts, list) else texts
            embeddings[filename] = _fake_embedding(first_text)
# Calculate cosine similarities
        def cosine_similarity(a, b):
            # Guard against zero vectors (possible if a digest has no set bits)
            denom = np.linalg.norm(a) * np.linalg.norm(b)
            return np.dot(a, b) / denom if denom else 0.0
# Similar documents should have high similarity
sim_1_2 = cosine_similarity(embeddings["doc1.md"], embeddings["doc2.md"])
sim_1_3 = cosine_similarity(embeddings["doc1.md"], embeddings["doc3.md"])
sim_2_3 = cosine_similarity(embeddings["doc2.md"], embeddings["doc3.md"])
# Dissimilar document should have low similarity
sim_1_cook = cosine_similarity(embeddings["doc1.md"], embeddings["cooking.md"])
sim_2_cook = cosine_similarity(embeddings["doc2.md"], embeddings["cooking.md"])
        # The hash-based mock embeddings are deterministic for the fixed
        # strings above but carry no real semantic signal, so these relative
        # checks validate plumbing rather than true embedding quality
        assert sim_1_2 > sim_1_cook or sim_1_3 > sim_1_cook
        assert sim_2_3 > sim_2_cook
def test_semantic_search_with_synonyms(self, semantic_indexer, mock_qdrant_client):
"""Test semantic search finds documents with synonyms and related terms."""
# Setup mock results for synonym search
mock_results = [
MagicMock(
id="doc1",
score=0.92,
payload={
"content": "Software installation guide",
"file_path": "/docs/setup.md",
"type": "document",
},
),
MagicMock(
id="doc2",
score=0.88,
payload={
"content": "Application deployment instructions",
"file_path": "/docs/deploy.md",
"type": "document",
},
),
MagicMock(
id="doc3",
score=0.85,
payload={
"content": "Program setup tutorial",
"file_path": "/docs/tutorial.md",
"type": "document",
},
),
]
mock_qdrant_client.search.return_value = mock_results
        # Search with a mix of synonym terms
        results = semantic_indexer.search_documents(
            query="software deployment setup", limit=10
        )
# Should find all related documents
assert len(results) == 3
assert all(r["score"] > 0.8 for r in results)
def test_semantic_indexer_chunk_boundaries(self, semantic_indexer):
"""Test that semantic indexer respects chunk boundaries and context."""
# Create document with clear section boundaries
document = """# User Manual
## Chapter 1: Getting Started
This chapter covers the basics of getting started with our application.
You'll learn how to install and configure the software.
### Installation Steps
1. Download the installer
2. Run the setup wizard
3. Configure initial settings
## Chapter 2: Advanced Features
This chapter explores advanced features and customization options.
Power users will find detailed configuration instructions here.
### Custom Workflows
Create custom workflows to automate repetitive tasks.
Use the workflow editor to design your automation.
## Chapter 3: Troubleshooting
Common issues and their solutions are covered in this chapter.
Find answers to frequently asked questions.
### Error Messages
Understanding and resolving common error messages.
Diagnostic tools and debugging techniques.
"""
# Index document
semantic_indexer.index_document_content(
document, str(self.workspace / "manual.md"), "markdown"
)
# Verify chunking preserved section boundaries
embed_calls = semantic_indexer._voyage_client.embed.call_args_list
# Should have multiple chunks
assert len(embed_calls) > 1
# Each chunk should contain coherent content
        for call in embed_calls:
            chunk_text = call[0][0][0] if isinstance(call[0][0], list) else call[0][0]
            # Chunks should not split mid-sentence; strip trailing whitespace
            # so an ending like "...the\n" still trips the check
            assert not chunk_text.rstrip().endswith((" the", " a", " an", " of"))
def test_semantic_vector_search_accuracy(self, semantic_indexer, mock_qdrant_client):
"""Test accuracy of vector-based semantic search."""
# Create test documents with varying semantic similarity
test_docs = [
{
"id": "exact",
"content": "How to configure database connection settings",
"score": 0.98,
},
{"id": "similar", "content": "Database configuration and setup guide", "score": 0.92},
{"id": "related", "content": "SQL server connection parameters", "score": 0.85},
{"id": "distant", "content": "Web server configuration", "score": 0.65},
]
# Mock search results in order of relevance
mock_results = [
MagicMock(
id=doc["id"],
score=doc["score"],
payload={"content": doc["content"], "file_path": f'/{doc["id"]}.md'},
)
for doc in test_docs
]
mock_qdrant_client.search.return_value = mock_results[:3] # Top 3
# Perform search
results = semantic_indexer.search_documents(
query="database connection configuration", limit=3
)
# Verify ranking order
assert len(results) == 3
assert results[0]["score"] > results[1]["score"]
assert results[1]["score"] > results[2]["score"]
# Most relevant should score highly
assert results[0]["score"] > 0.9
def test_semantic_indexer_multilingual_support(self, semantic_indexer):
"""Test semantic indexer with multilingual content."""
# Create multilingual documents
multilingual_docs = [
("english.md", "# Documentation\n\nThis is English documentation."),
("spanish.md", "# Documentación\n\nEsta es documentación en español."),
("french.md", "# Documentation\n\nCeci est une documentation en français."),
(
"mixed.md",
"# Mixed Language\n\nThis document contains English, español, and français.",
),
]
# Index all documents
for filename, content in multilingual_docs:
file_path = str(self.workspace / filename)
semantic_indexer.index_document_content(content, file_path, "markdown")
# Verify all documents were indexed
assert semantic_indexer._voyage_client.embed.call_count >= len(multilingual_docs)
# Each language should be processed
embed_calls = semantic_indexer._voyage_client.embed.call_args_list
texts_indexed = []
for call in embed_calls:
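            # embed may be called with a single string or a batch list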
if isinstance(call[0][0], list):
texts_indexed.extend(call[0][0])
else:
texts_indexed.append(call[0][0])
# Should contain text from all languages
assert any("English" in text for text in texts_indexed)
assert any("español" in text for text in texts_indexed)
assert any("français" in text for text in texts_indexed)
def test_semantic_indexer_performance_optimization(self, semantic_indexer):
"""Test performance optimizations in semantic indexer."""
# Create documents of varying sizes
small_doc = "Small content."
        medium_doc = "Medium document. " * 100  # ~1.7KB
        large_doc = "Large document content. " * 1000  # ~24KB
# Track indexing times
times = {}
# Index small document
start = time.time()
semantic_indexer.index_document_content(
small_doc, str(self.workspace / "small.md"), "markdown"
)
times["small"] = time.time() - start
# Index medium document
start = time.time()
semantic_indexer.index_document_content(
medium_doc, str(self.workspace / "medium.md"), "markdown"
)
times["medium"] = time.time() - start
# Index large document
start = time.time()
semantic_indexer.index_document_content(
large_doc, str(self.workspace / "large.md"), "markdown"
)
times["large"] = time.time() - start
        # Performance should scale roughly linearly with document size; floor
        # the baseline so a near-zero timing on a fast machine doesn't make
        # the bound meaninglessly tight
        baseline = max(times["small"], 1e-3)
        assert times["large"] < baseline * 100  # not 100x slower than small
def test_semantic_search_with_metadata_filters(self, semantic_indexer, mock_qdrant_client):
"""Test semantic search with metadata-based filtering."""
# Setup mock results with different metadata
mock_results = [
MagicMock(
id="recent1",
score=0.95,
payload={
"content": "Recent API documentation",
"file_path": "/docs/api_v2.md",
"metadata": {"version": "2.0", "date": "2024-01-15"},
},
),
MagicMock(
id="old1",
score=0.93,
payload={
"content": "Legacy API documentation",
"file_path": "/docs/api_v1.md",
"metadata": {"version": "1.0", "date": "2023-01-15"},
},
),
MagicMock(
id="recent2",
score=0.90,
payload={
"content": "Current API guide",
"file_path": "/docs/guide.md",
"metadata": {"version": "2.0", "date": "2024-01-10"},
},
),
]
mock_qdrant_client.search.return_value = mock_results
# Search with metadata preferences
results = semantic_indexer.search_documents(
query="API documentation",
limit=10,
metadata_filter={"version": "2.0"}, # Only recent versions
)
# Should prioritize recent versions
assert len(results) >= 2
# Results should be filtered or ranked by metadata
for result in results[:2]:
if "metadata" in result.get("payload", {}):
assert result["payload"]["metadata"].get("version") == "2.0"
def test_semantic_indexer_error_recovery(self, semantic_indexer, mock_voyage_client):
"""Test semantic indexer error handling and recovery."""
# Simulate embedding API failure
error_count = 0
original_embed = mock_voyage_client.embed.side_effect
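        # Fail the first two embed calls, then defer to the original mock so
        # a retrying indexer can eventually succeed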
def failing_embed(*args, **kwargs):
nonlocal error_count
error_count += 1
if error_count <= 2:
raise Exception("Embedding API temporarily unavailable")
return original_embed(*args, **kwargs)
mock_voyage_client.embed.side_effect = failing_embed
# Try to index document (should retry and eventually succeed)
try:
semantic_indexer.index_document_content(
"Test content for error recovery", str(self.workspace / "test.md"), "markdown"
)
except Exception:
# If it fails completely, that's OK for this test
pass
# Should have attempted multiple times
assert error_count >= 2
# Reset for next operation
error_count = 0
mock_voyage_client.embed.side_effect = original_embed
# Next operation should work normally
semantic_indexer.index_document_content(
"Recovery test content", str(self.workspace / "recovery.md"), "markdown"
)
assert mock_voyage_client.embed.called
if __name__ == "__main__":
pytest.main([__file__, "-v"])