Markdown RAG Documentation

Overview Schema Related Servers Score Discussions

test_query_expansion.py•15.7 KiB

""" Integration tests for query expansion via embeddings. Tests cover: - Building concept vocabulary from empty/populated index - Term extraction and stopword filtering - Query expansion with nearest terms - Duplicate term handling - Vocabulary persistence Test strategies: - Use real embedding model (shared_embedding_model from conftest) - Test vocabulary building with known document content - Verify expansion behavior with real embeddings """ from datetime import datetime import pytest from src.indices.vector import VectorIndex, STOPWORDS from src.models import Chunk # ============================================================================ # Fixtures # ============================================================================ @pytest.fixture def vector_index_empty(shared_embedding_model): """ Create empty VectorIndex with shared embedding model. """ return VectorIndex(embedding_model=shared_embedding_model) @pytest.fixture def vector_index_populated(shared_embedding_model): """ Create VectorIndex populated with test chunks. Contains chunks about authentication, authorization, and login. """ index = VectorIndex(embedding_model=shared_embedding_model) chunks = [ Chunk( chunk_id="doc1_chunk_0", doc_id="doc1", content="Authentication is the process of verifying user identity.", metadata={"tags": [], "links": []}, chunk_index=0, header_path="Security > Authentication", start_pos=0, end_pos=60, file_path="/docs/auth.md", modified_time=datetime.now(), ), Chunk( chunk_id="doc2_chunk_0", doc_id="doc2", content="Authorization determines what a user can access after login.", metadata={"tags": [], "links": []}, chunk_index=0, header_path="Security > Authorization", start_pos=0, end_pos=60, file_path="/docs/authz.md", modified_time=datetime.now(), ), Chunk( chunk_id="doc3_chunk_0", doc_id="doc3", content="The login page allows users to enter credentials for authentication.", metadata={"tags": [], "links": []}, chunk_index=0, header_path="User Interface > Login", start_pos=0, end_pos=70, file_path="/docs/login.md", modified_time=datetime.now(), ), ] for chunk in chunks: index.add_chunk(chunk) return index # ============================================================================ # Build Vocabulary - Empty Index Tests # ============================================================================ class TestBuildVocabularyEmptyIndex: """Tests for vocabulary building with empty index.""" def test_build_vocabulary_empty_index_produces_empty_vocab(self, vector_index_empty): """ Empty index produces empty vocabulary. No documents means no terms to extract. """ vector_index_empty.build_concept_vocabulary() assert vector_index_empty._concept_vocabulary == {} def test_build_vocabulary_empty_index_no_errors(self, vector_index_empty): """ Building vocabulary on empty index doesn't raise errors. Graceful handling of edge case. """ # Should not raise vector_index_empty.build_concept_vocabulary() # ============================================================================ # Build Vocabulary - Term Extraction Tests # ============================================================================ class TestBuildVocabularyExtractsTerms: """Tests for term extraction from documents.""" def test_build_vocabulary_extracts_terms(self, vector_index_populated): """ Extracts terms from indexed documents. Vocabulary should contain terms from chunk content. """ vector_index_populated.build_concept_vocabulary() vocab = vector_index_populated._concept_vocabulary assert len(vocab) > 0 # Should contain key terms from content vocab_terms = set(vocab.keys()) assert "authentication" in vocab_terms assert "authorization" in vocab_terms assert "login" in vocab_terms assert "user" in vocab_terms def test_build_vocabulary_respects_min_term_length(self, vector_index_populated): """ Filters out terms shorter than min_term_length. Short terms like "a", "is" should be excluded. """ vector_index_populated.build_concept_vocabulary(min_term_length=4) vocab_terms = set(vector_index_populated._concept_vocabulary.keys()) # No terms shorter than 4 chars for term in vocab_terms: assert len(term) >= 4 def test_build_vocabulary_respects_max_terms(self, vector_index_populated): """ Limits vocabulary to max_terms most frequent terms. Tests vocabulary size constraint. """ vector_index_populated.build_concept_vocabulary(max_terms=5) assert len(vector_index_populated._concept_vocabulary) <= 5 # ============================================================================ # Build Vocabulary - Stopword Filtering Tests # ============================================================================ class TestBuildVocabularyFiltersStopwords: """Tests for stopword filtering in vocabulary building.""" def test_build_vocabulary_filters_stopwords(self, vector_index_populated): """ Stopwords not included in vocabulary. Common words like "the", "is", "a" should be excluded. """ vector_index_populated.build_concept_vocabulary() vocab_terms = set(vector_index_populated._concept_vocabulary.keys()) # None of the stopwords should be in vocabulary for stopword in ["the", "is", "a", "of", "to", "and", "for"]: assert stopword not in vocab_terms def test_stopwords_constant_includes_common_words(self): """ STOPWORDS constant includes common English words. Verifies the stopword list is properly defined. """ common_stopwords = {"the", "a", "an", "is", "are", "was", "were", "be"} assert common_stopwords.issubset(STOPWORDS) # ============================================================================ # Query Expansion - Basic Tests # ============================================================================ class TestExpandQueryAddsTerms: """Tests for basic query expansion behavior.""" def test_expand_query_adds_terms(self, vector_index_populated): """ Expansion adds related terms to query. Query should be augmented with semantically similar terms. """ vector_index_populated.build_concept_vocabulary() original_query = "auth" expanded_query = vector_index_populated.expand_query( original_query, top_k=3, similarity_threshold=0.0, # Low threshold to ensure expansion ) # Expanded query should be longer assert len(expanded_query) > len(original_query) # Original query terms preserved assert "auth" in expanded_query.lower() def test_expand_query_returns_string(self, vector_index_populated): """ Expanded query is a string. Type verification for API contract. """ vector_index_populated.build_concept_vocabulary() result = vector_index_populated.expand_query("authentication") assert isinstance(result, str) # ============================================================================ # Query Expansion - No Duplicate Terms Tests # ============================================================================ class TestExpandQueryNoDuplicateTerms: """Tests for duplicate term handling in expansion.""" def test_expand_query_no_duplicate_terms(self, vector_index_populated): """ Terms already in query not added again. Prevents redundant expansion. """ vector_index_populated.build_concept_vocabulary() # Query already contains "authentication" original_query = "authentication process" expanded_query = vector_index_populated.expand_query( original_query, top_k=5, similarity_threshold=0.0, ) # Count occurrences of "authentication" tokens = expanded_query.lower().split() auth_count = tokens.count("authentication") # Should only appear once (from original query) assert auth_count <= 1 # ============================================================================ # Query Expansion - Empty Vocabulary Tests # ============================================================================ class TestExpandQueryEmptyVocabulary: """Tests for expansion with empty vocabulary.""" def test_expand_query_empty_vocabulary_returns_original(self, vector_index_empty): """ Returns original query if no vocabulary. Graceful degradation when vocabulary not built. """ # Don't build vocabulary original_query = "test query" result = vector_index_empty.expand_query(original_query) assert result == original_query def test_expand_query_empty_vocabulary_no_errors(self, vector_index_empty): """ No errors when expanding with empty vocabulary. Safe fallback behavior. """ # Should not raise result = vector_index_empty.expand_query("test query") assert isinstance(result, str) # ============================================================================ # Vocabulary Persistence Tests # ============================================================================ class TestVocabularyPersistence: """Tests for vocabulary save and load.""" def test_vocabulary_persistence_saved(self, vector_index_populated, tmp_path): """ Vocabulary saved to disk during persist. Verifies file is created. """ vector_index_populated.build_concept_vocabulary() persist_path = tmp_path / "index" vector_index_populated.persist(persist_path) vocab_file = persist_path / "concept_vocabulary.json" assert vocab_file.exists() def test_vocabulary_persistence_loaded(self, shared_embedding_model, tmp_path): """ Vocabulary loaded from disk correctly. Full round-trip test: save and load. """ # Create and populate first index index1 = VectorIndex(embedding_model=shared_embedding_model) chunk = Chunk( chunk_id="doc1_chunk_0", doc_id="doc1", content="Authentication security credentials verification process.", metadata={}, chunk_index=0, header_path="", start_pos=0, end_pos=60, file_path="/docs/test.md", modified_time=datetime.now(), ) index1.add_chunk(chunk) index1.build_concept_vocabulary(min_frequency=1) persist_path = tmp_path / "index" index1.persist(persist_path) # Create second index and load index2 = VectorIndex(embedding_model=shared_embedding_model) index2.load(persist_path) # Vocabulary should be loaded assert len(index2._concept_vocabulary) > 0 assert "authentication" in index2._concept_vocabulary or "security" in index2._concept_vocabulary def test_vocabulary_persistence_missing_file(self, shared_embedding_model, tmp_path): """ Handles missing vocabulary file gracefully. When loading an index that was persisted without a vocabulary, the vocabulary should remain empty. """ persist_path = tmp_path / "index" # Create a real index and persist it WITHOUT building vocabulary index1 = VectorIndex(embedding_model=shared_embedding_model) chunk = Chunk( chunk_id="doc1_chunk_0", doc_id="doc1", content="Test content for index.", metadata={}, chunk_index=0, header_path="", start_pos=0, end_pos=30, file_path="/docs/test.md", modified_time=datetime.now(), ) index1.add_chunk(chunk) # Note: NOT calling build_concept_vocabulary() index1.persist(persist_path) # Now delete the vocabulary file if it exists vocab_file = persist_path / "concept_vocabulary.json" if vocab_file.exists(): vocab_file.unlink() # Load into new index - should handle missing vocab file index2 = VectorIndex(embedding_model=shared_embedding_model) index2.load(persist_path) # Should have empty vocabulary (not crash) assert index2._concept_vocabulary == {} # ============================================================================ # Query Expansion - Similarity Threshold Tests # ============================================================================ class TestExpandQuerySimilarityThreshold: """Tests for similarity threshold in expansion.""" def test_expand_query_high_threshold_fewer_terms(self, vector_index_populated): """ High similarity threshold results in fewer expansion terms. Stricter matching = less expansion. """ vector_index_populated.build_concept_vocabulary() query = "user" expanded_low = vector_index_populated.expand_query( query, top_k=5, similarity_threshold=0.0, ) expanded_high = vector_index_populated.expand_query( query, top_k=5, similarity_threshold=0.9, ) # High threshold should have fewer or equal terms assert len(expanded_high.split()) <= len(expanded_low.split()) def test_expand_query_threshold_one_no_expansion(self, vector_index_populated): """ Threshold of 1.0 means no expansion (impossible to match). Only exact embedding match would pass. """ vector_index_populated.build_concept_vocabulary() query = "test query" expanded = vector_index_populated.expand_query( query, top_k=5, similarity_threshold=1.0, ) # Should return original query unchanged assert expanded == query # ============================================================================ # Query Expansion - top_k Tests # ============================================================================ class TestExpandQueryTopK: """Tests for top_k parameter in expansion.""" def test_expand_query_respects_top_k(self, vector_index_populated): """ Expansion limited to top_k terms. At most top_k new terms added. """ vector_index_populated.build_concept_vocabulary() query = "security" top_k = 2 expanded = vector_index_populated.expand_query( query, top_k=top_k, similarity_threshold=0.0, ) # Count new terms (expanded - original) original_tokens = set(query.lower().split()) expanded_tokens = set(expanded.lower().split()) new_tokens = expanded_tokens - original_tokens assert len(new_tokens) <= top_k def test_expand_query_top_k_zero_no_expansion(self, vector_index_populated): """ top_k=0 means no expansion terms added. """ vector_index_populated.build_concept_vocabulary() query = "test" expanded = vector_index_populated.expand_query( query, top_k=0, similarity_threshold=0.0, ) assert expanded == query

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/andnp/ragdocs-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

test_query_expansion.py•15.7 KiB