import logging
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

import pytest

from src.indices.vector import VectorIndex
from src.models import Chunk, Document

def _extract_chunk_ids(results: list) -> list[str]:
    """Normalize search results to a list of IDs.

    search() may return either bare ID strings or result dicts carrying a
    "chunk_id" key; this helper lets tests handle both shapes uniformly.
    """
    if not results:
        return []
    if isinstance(results[0], str):
        return results
    return [r["chunk_id"] for r in results]
@pytest.fixture
def sample_document():
return Document(
id="test-doc",
content="# Machine Learning\n\nMachine learning is a subset of artificial intelligence.",
metadata={"title": "ML Intro"},
links=["AI"],
tags=["ml", "ai"],
file_path="/tmp/test.md",
modified_time=datetime.now(),
)
@pytest.fixture
def vector_index(shared_embedding_model):
return VectorIndex(embedding_model=shared_embedding_model)
def test_vector_index_add_and_search(vector_index, sample_document):
vector_index.add(sample_document)
results = vector_index.search("what is machine learning", top_k=5)
assert "test-doc" in _extract_chunk_ids(results)
assert len(results) <= 5
def test_vector_index_remove(vector_index, sample_document):
vector_index.add(sample_document)
results_before = vector_index.search("machine learning", top_k=5)
assert "test-doc" in _extract_chunk_ids(results_before)
vector_index.remove("test-doc")
    # remove() completing without raising is the observable contract here:
    # because FAISS does not support in-place deletion, the raw vectors stay
    # in the index while the doc-id mapping is dropped.
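    # A stronger check one could add, assuming search() filters hits through
    # the doc-id mapping (implementation-specific, hence left commented):
    # results_after = vector_index.search("machine learning", top_k=5)
    # assert "test-doc" not in _extract_chunk_ids(results_after)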
def test_vector_index_empty_query(vector_index, sample_document):
vector_index.add(sample_document)
results = vector_index.search("", top_k=5)
assert results == []
results = vector_index.search(" ", top_k=5)
assert results == []
def test_vector_index_persist_and_load(tmp_path, sample_document, shared_embedding_model):
index1 = VectorIndex(embedding_model=shared_embedding_model)
index1.add(sample_document)
persist_path = tmp_path / "index"
index1.persist(persist_path)
assert persist_path.exists()
assert (persist_path / "doc_id_mapping.json").exists()
index2 = VectorIndex(embedding_model=shared_embedding_model)
index2.load(persist_path)
results = index2.search("machine learning", top_k=5)
assert "test-doc" in _extract_chunk_ids(results)
def test_vector_index_multiple_documents(vector_index):
doc1 = Document(
id="doc1",
content="Python is a programming language.",
metadata={},
links=[],
tags=["python"],
file_path="/tmp/doc1.md",
modified_time=datetime.now(),
)
doc2 = Document(
id="doc2",
content="JavaScript is used for web development.",
metadata={},
links=[],
tags=["javascript"],
file_path="/tmp/doc2.md",
modified_time=datetime.now(),
)
vector_index.add(doc1)
vector_index.add(doc2)
results = vector_index.search("programming language", top_k=5)
assert "doc1" in _extract_chunk_ids(results) or "doc2" in results
assert len(results) <= 5
def test_vector_index_search_returns_unique_doc_ids(vector_index):
doc = Document(
id="long-doc",
content="# Chapter 1\n\n" + "Content paragraph. " * 100 + "\n\n# Chapter 2\n\n" + "More content here. " * 100,
metadata={},
links=[],
tags=[],
file_path="/tmp/long.md",
modified_time=datetime.now(),
)
vector_index.add(doc)
results = vector_index.search("content", top_k=10)
assert _extract_chunk_ids(results).count("long-doc") == 1
def test_vector_index_load_nonexistent_path(tmp_path, shared_embedding_model):
index = VectorIndex(embedding_model=shared_embedding_model)
nonexistent_path = tmp_path / "nonexistent"
index.load(nonexistent_path)
# Verify index is functional by adding and searching a document
doc = Document(
id="test-doc",
content="Test content for initialization.",
metadata={},
links=[],
tags=[],
file_path="/tmp/test.md",
modified_time=datetime.now(),
)
index.add(doc)
results = index.search("test content", top_k=5)
assert "test-doc" in _extract_chunk_ids(results)
def test_vector_index_very_long_document(vector_index):
"""
Test handling of documents exceeding 10k characters.
    Ensures chunking and indexing work without errors for large documents.
"""
long_content = "# Very Long Document\n\n"
long_content += "This is a paragraph with meaningful content. " * 250 # ~11k chars
long_content += "\n\n## Section 2\n\n"
long_content += "More content here with searchable terms like embeddings and vectors. " * 100
doc = Document(
id="long-doc",
content=long_content,
metadata={"title": "Long Document"},
links=[],
tags=["long", "test"],
file_path="/tmp/long_doc.md",
modified_time=datetime.now(),
)
    # Sanity-check the fixture before exercising the index
    assert len(long_content) > 10000
    vector_index.add(doc)
    results = vector_index.search("embeddings and vectors", top_k=5)
    assert "long-doc" in _extract_chunk_ids(results)
def test_vector_index_special_characters(vector_index):
"""
Test handling of special characters, unicode, and symbols.
Validates robustness against emojis, punctuation, and non-ASCII content.
"""
content = """# Special Characters Test 🚀
Unicode: 你好世界 (Chinese), Привет мир (Russian), مرحبا بالعالم (Arabic)
Symbols & Punctuation: !@#$%^&*()_+-=[]{}|;':",./<>?
Math: α, β, γ, ∑, ∫, ∞, ≈, ≠, ≤, ≥
Emojis: 🎉 🔥 💡 🌟 ⚡ 🎯 🚀 💻 📚 🌍
Quotes: "smart quotes" 'apostrophes' «guillemets» ‹single›
Special: €, £, ¥, ©, ®, ™, §, ¶, †, ‡, …
"""
doc = Document(
id="special-chars",
content=content,
metadata={"type": "special"},
links=[],
tags=["unicode", "symbols"],
file_path="/tmp/special.md",
modified_time=datetime.now(),
)
vector_index.add(doc)
results = vector_index.search("unicode symbols", top_k=5)
assert "special-chars" in _extract_chunk_ids(results)
results_emoji = vector_index.search("emojis punctuation", top_k=5)
assert "special-chars" in _extract_chunk_ids(results_emoji)
# ============================================================================
# Header-Weighted Embedding Tests (Phase 1 Search Quality)
# ============================================================================
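# A minimal sketch of the embedding text these tests assume VectorIndex builds;
# the real separator and any extra weighting live in VectorIndex itself:
#
#     def _embedding_text(chunk):
#         if chunk.header_path:
#             return f"{chunk.header_path}\n\n{chunk.content}"
#         return chunk.content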
def test_vector_index_chunk_with_header_path_includes_header_in_embedding(shared_embedding_model):
"""
    Chunks with header_path include header context in the embedding.
Verifies P2: Heading-weighted embeddings prepend header_path to content
before generating embeddings, improving semantic search relevance.
"""
vector_index = VectorIndex(embedding_model=shared_embedding_model)
chunk = Chunk(
chunk_id="doc1_chunk_0",
doc_id="doc1",
content="This section covers the basics of training neural networks.",
metadata={"tags": [], "links": []},
chunk_index=0,
header_path="Machine Learning > Deep Learning > Training",
start_pos=0,
end_pos=100,
file_path="/tmp/ml.md",
modified_time=datetime.now(),
)
vector_index.add_chunk(chunk)
# Search should find content via header context
results = vector_index.search("deep learning training", top_k=5)
assert "doc1_chunk_0" in _extract_chunk_ids(results)
# Search for terms only in header_path
results_header = vector_index.search("machine learning", top_k=5)
assert "doc1_chunk_0" in _extract_chunk_ids(results_header)
def test_vector_index_chunk_without_header_path_uses_content_only(shared_embedding_model):
"""
    Chunks without header_path use the content alone for the embedding.
Ensures backward compatibility for chunks that don't have header paths.
"""
vector_index = VectorIndex(embedding_model=shared_embedding_model)
chunk = Chunk(
chunk_id="doc2_chunk_0",
doc_id="doc2",
content="Python is a versatile programming language used for web development.",
metadata={"tags": ["python"], "links": []},
chunk_index=0,
header_path="", # Empty header path
start_pos=0,
end_pos=80,
file_path="/tmp/python.md",
modified_time=datetime.now(),
)
vector_index.add_chunk(chunk)
# Search for content terms
results = vector_index.search("python programming language", top_k=5)
assert "doc2_chunk_0" in _extract_chunk_ids(results)
def test_vector_index_header_weighted_improves_relevance(shared_embedding_model):
"""
Header-weighted embeddings improve search relevance for related queries.
Verifies that chunks with relevant headers rank higher than those without
when searching for terms in the header path.
"""
vector_index = VectorIndex(embedding_model=shared_embedding_model)
# Chunk with relevant header
chunk_with_header = Chunk(
chunk_id="api_chunk_0",
doc_id="api-docs",
content="This function accepts parameters and returns a value.",
metadata={},
chunk_index=0,
header_path="API Reference > Authentication > Token Validation",
start_pos=0,
end_pos=60,
file_path="/tmp/api.md",
modified_time=datetime.now(),
)
# Chunk without relevant header
chunk_without_header = Chunk(
chunk_id="general_chunk_0",
doc_id="general-docs",
content="This function accepts parameters and returns a value.",
metadata={},
chunk_index=0,
header_path="", # No header
start_pos=0,
end_pos=60,
file_path="/tmp/general.md",
modified_time=datetime.now(),
)
vector_index.add_chunk(chunk_with_header)
vector_index.add_chunk(chunk_without_header)
# Search for authentication - should favor chunk with relevant header
results = vector_index.search("authentication token validation", top_k=5)
chunk_ids = _extract_chunk_ids(results)
assert "api_chunk_0" in chunk_ids
# api_chunk should rank higher due to header context
if "general_chunk_0" in chunk_ids:
assert chunk_ids.index("api_chunk_0") < chunk_ids.index("general_chunk_0")
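    # Note: the ranking assertion is guarded by the membership check because
    # absolute rankings vary across embedding models; only the relative order
    # is asserted, and only when both chunks are retrieved.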
# ============================================================================
# Stale Reference Cleanup Tests (Self-Healing)
# ============================================================================
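# Rough shape of the self-healing lookup these tests pin down (assumed control
# flow, not the actual implementation):
#
#     def get_chunk_by_id(self, chunk_id):
#         node_id = self._chunk_id_to_node_id.get(chunk_id)
#         if node_id is None or node_id not in docstore:
#             # drop chunk_id from both mappings, warn once per ID
#             return None
#         return docstore[node_id]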
def test_stale_chunk_ref_cleaned_on_lookup(shared_embedding_model):
"""
Test that looking up a stale chunk ID cleans it from mappings.
When get_chunk_by_id() encounters a chunk ID that exists in mappings
but not in the docstore, it should remove the stale reference.
"""
vector_index = VectorIndex(embedding_model=shared_embedding_model)
# Add a real chunk
chunk = Chunk(
chunk_id="real_chunk",
doc_id="doc1",
content="Real content that exists in docstore.",
metadata={},
chunk_index=0,
header_path="",
start_pos=0,
end_pos=50,
file_path="/tmp/real.md",
modified_time=datetime.now(),
)
vector_index.add_chunk(chunk)
# Manually inject a stale reference (ID in mappings but not docstore)
stale_chunk_id = "stale_orphan_chunk"
vector_index._chunk_id_to_node_id[stale_chunk_id] = stale_chunk_id
vector_index._doc_id_to_node_ids["orphan_doc"] = [stale_chunk_id]
# Verify stale ref exists in mappings
assert stale_chunk_id in vector_index._chunk_id_to_node_id
assert "orphan_doc" in vector_index._doc_id_to_node_ids
# Look up the stale chunk - should trigger cleanup
result = vector_index.get_chunk_by_id(stale_chunk_id)
assert result is None
# Verify stale ref was cleaned from mappings
assert stale_chunk_id not in vector_index._chunk_id_to_node_id
assert "orphan_doc" not in vector_index._doc_id_to_node_ids
def test_stale_warning_only_logged_once(shared_embedding_model, caplog):
"""
    Test that the stale-chunk warning is only logged once per chunk ID.
    The _warned_stale_chunk_ids set should prevent duplicate warnings from
    flooding the logs when the same stale chunk is accessed repeatedly.
"""
vector_index = VectorIndex(embedding_model=shared_embedding_model)
vector_index._initialize_index() # Initialize so index is not None
# Inject a stale reference
stale_id = "stale_repeatedly_accessed"
vector_index._chunk_id_to_node_id[stale_id] = stale_id
    # Clear any previously warned chunk IDs
vector_index._warned_stale_chunk_ids.clear()
with caplog.at_level(logging.WARNING):
# First lookup - should log warning
vector_index.get_chunk_by_id(stale_id)
# Count warnings for our stale ID
first_warning_count = sum(
1 for record in caplog.records
if stale_id in record.message and record.levelno == logging.WARNING
)
assert first_warning_count == 1
    # Re-add the stale ref to simulate the stale mapping reappearing
vector_index._chunk_id_to_node_id[stale_id] = stale_id
# Second lookup - should NOT log another warning
caplog.clear()
vector_index.get_chunk_by_id(stale_id)
second_warning_count = sum(
1 for record in caplog.records
if stale_id in record.message and record.levelno == logging.WARNING
)
assert second_warning_count == 0
# Verify the ID is in warned set
assert stale_id in vector_index._warned_stale_chunk_ids
def test_reconcile_mappings_removes_stale_refs(shared_embedding_model):
"""
Test that reconcile_mappings() batch-removes all stale references.
The reconcile_mappings() method should scan all mappings and remove
any chunk IDs that no longer exist in the docstore.
"""
vector_index = VectorIndex(embedding_model=shared_embedding_model)
# Add real chunks
for i in range(3):
chunk = Chunk(
chunk_id=f"real_chunk_{i}",
doc_id=f"doc_{i}",
content=f"Real content number {i}.",
metadata={},
chunk_index=0,
header_path="",
start_pos=0,
end_pos=30,
file_path=f"/tmp/real_{i}.md",
modified_time=datetime.now(),
)
vector_index.add_chunk(chunk)
# Inject multiple stale references
for i in range(5):
stale_id = f"stale_batch_{i}"
vector_index._chunk_id_to_node_id[stale_id] = stale_id
vector_index._doc_id_to_node_ids[f"stale_doc_{i}"] = [stale_id]
# Verify all stale refs exist
assert len([k for k in vector_index._chunk_id_to_node_id if k.startswith("stale_")]) == 5
assert len([k for k in vector_index._doc_id_to_node_ids if k.startswith("stale_")]) == 5
# Run reconciliation
removed_count = vector_index.reconcile_mappings()
# Should have removed all 5 stale refs
assert removed_count == 5
# Verify no stale refs remain
assert len([k for k in vector_index._chunk_id_to_node_id if k.startswith("stale_")]) == 0
assert len([k for k in vector_index._doc_id_to_node_ids if k.startswith("stale_")]) == 0
# Real chunks should still be there
assert len([k for k in vector_index._chunk_id_to_node_id if k.startswith("real_")]) == 3
def test_warned_set_cleared_on_load(shared_embedding_model, tmp_path):
"""
Test that _warned_stale_chunk_ids is cleared when loading index.
After loading an index from disk, the warned set should be reset
since the stale state may have changed during persistence.
"""
vector_index = VectorIndex(embedding_model=shared_embedding_model)
# Add a chunk
chunk = Chunk(
chunk_id="persistent_chunk",
doc_id="doc1",
content="Content to persist.",
metadata={},
chunk_index=0,
header_path="",
start_pos=0,
end_pos=30,
file_path="/tmp/test.md",
modified_time=datetime.now(),
)
vector_index.add_chunk(chunk)
# Simulate having warned about some stale chunks
vector_index._warned_stale_chunk_ids["old_stale_1"] = True
vector_index._warned_stale_chunk_ids["old_stale_2"] = True
# Persist to disk
persist_path = tmp_path / "vector_index"
vector_index.persist(persist_path)
# Create new index and load
vector_index2 = VectorIndex(embedding_model=shared_embedding_model)
    # Add a warned ID before load (simulating a prior session)
    vector_index2._warned_stale_chunk_ids.add("pre_load_warning")
# Load the persisted index
vector_index2.load(persist_path)
# Warned set should be cleared
assert len(vector_index2._warned_stale_chunk_ids) == 0
# Verify index is functional
results = vector_index2.search("content persist", top_k=5)
assert "persistent_chunk" in _extract_chunk_ids(results)
def test_term_counts_and_vocabulary_loaded_as_ordereddict(shared_embedding_model, tmp_path):
"""
Test that _term_counts and _concept_vocabulary are loaded as OrderedDict.
    Regression test for a bug where JSON loading returned a plain dict,
    causing an AttributeError when move_to_end() was called during indexing.
"""
vector_index = VectorIndex(embedding_model=shared_embedding_model)
# Add chunks to populate term counts and vocabulary
for i in range(3):
chunk = Chunk(
chunk_id=f"chunk_{i}",
doc_id=f"doc_{i}",
content="Machine learning algorithms require training data for optimization.",
metadata={},
chunk_index=0,
header_path="",
start_pos=0,
end_pos=100,
file_path=f"/tmp/doc_{i}.md",
modified_time=datetime.now(),
)
vector_index.add_chunk(chunk)
# Register terms to populate _term_counts
vector_index.register_document_terms("machine learning optimization algorithms")
# Build vocabulary
vector_index.build_concept_vocabulary(min_term_length=3, max_terms=100, min_frequency=1)
# Verify both are OrderedDict before persist
assert isinstance(vector_index._term_counts, OrderedDict)
assert isinstance(vector_index._concept_vocabulary, OrderedDict)
assert len(vector_index._term_counts) > 0
assert len(vector_index._concept_vocabulary) > 0
# Persist to disk
persist_path = tmp_path / "vector_index_ordered"
vector_index.persist(persist_path)
# Load into new index
vector_index2 = VectorIndex(embedding_model=shared_embedding_model)
vector_index2.load(persist_path)
# Verify both are still OrderedDict after load (this was the bug)
assert isinstance(vector_index2._term_counts, OrderedDict)
assert isinstance(vector_index2._concept_vocabulary, OrderedDict)
# Verify move_to_end() works (this would fail with plain dict)
if vector_index2._term_counts:
first_term = next(iter(vector_index2._term_counts))
vector_index2._term_counts.move_to_end(first_term) # Should not raise AttributeError
# Verify indexing still works after load (this triggered the original bug)
chunk = Chunk(
chunk_id="new_chunk_after_load",
doc_id="new_doc",
content="New content about neural networks and deep learning.",
metadata={},
chunk_index=0,
header_path="",
start_pos=0,
end_pos=60,
file_path="/tmp/new.md",
modified_time=datetime.now(),
)
vector_index2.add_chunk(chunk) # Should not raise AttributeError
# Verify search works
results = vector_index2.search("neural networks", top_k=5)
assert "new_chunk_after_load" in _extract_chunk_ids(results)
def test_ordered_dict_preserved_after_persist_and_load_regression(shared_embedding_model, tmp_path):
"""
    Regression test for a bug where json.load() returned a plain dict instead of an OrderedDict,
causing AttributeError: 'dict' object has no attribute 'move_to_end'.
This simulates the create_memory flow: persist index → load index → add new chunk.
The bug manifested when add_chunk called register_document_terms which called
_term_counts.move_to_end() on a plain dict.
"""
# Create index and add initial content
vector_index = VectorIndex(embedding_model=shared_embedding_model)
chunk1 = Chunk(
chunk_id="chunk_1",
doc_id="doc_1",
content="Python programming language with asyncio and type hints.",
metadata={},
chunk_index=0,
header_path="",
start_pos=0,
end_pos=100,
file_path="/tmp/doc_1.md",
modified_time=datetime.now(),
)
vector_index.add_chunk(chunk1)
# Populate term counts and vocabulary
vector_index.register_document_terms("python asyncio programming")
vector_index.build_concept_vocabulary(min_term_length=3, max_terms=100, min_frequency=1)
# Verify OrderedDict before persist
assert isinstance(vector_index._term_counts, OrderedDict), \
"term_counts should be OrderedDict before persist"
assert isinstance(vector_index._concept_vocabulary, OrderedDict), \
"concept_vocabulary should be OrderedDict before persist"
# Persist to disk
persist_path = tmp_path / "vector_ordered_dict_test"
vector_index.persist(persist_path)
# Create new index and load from disk (simulates memory system startup)
vector_index2 = VectorIndex(embedding_model=shared_embedding_model)
vector_index2.load(persist_path)
# CRITICAL: Verify both are OrderedDict after load (this was the bug)
assert isinstance(vector_index2._term_counts, OrderedDict), \
"term_counts should be OrderedDict after load (was plain dict, causing move_to_end AttributeError)"
assert isinstance(vector_index2._concept_vocabulary, OrderedDict), \
"concept_vocabulary should be OrderedDict after load"
# Simulate create_memory flow: add new chunk after loading (this would fail with plain dict)
chunk2 = Chunk(
chunk_id="chunk_2",
doc_id="doc_2",
content="FastAPI web framework with dependency injection and Pydantic models.",
metadata={},
chunk_index=0,
header_path="",
start_pos=0,
end_pos=100,
file_path="/tmp/doc_2.md",
modified_time=datetime.now(),
)
# This should NOT raise AttributeError: 'dict' object has no attribute 'move_to_end'
vector_index2.add_chunk(chunk2)
# Verify the chunk was indexed successfully
results = vector_index2.search("FastAPI web framework", top_k=5)
assert "chunk_2" in _extract_chunk_ids(results)
# Verify move_to_end() works directly (additional safety check)
if vector_index2._term_counts:
first_term = next(iter(vector_index2._term_counts))
vector_index2._term_counts.move_to_end(first_term) # Should not raise
# ============================================================================
# Chunk Removal Tests (Phase 2 Delta Indexing)
# ============================================================================
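# Contract exercised below, as implied by the assertions: remove_chunk() drops
# the chunk from _chunk_id_to_node_id, prunes it from the owning doc's entry in
# _doc_id_to_node_ids, and deletes that entry once it is empty.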
def test_remove_chunk_removes_from_all_mappings(vector_index, shared_embedding_model):
"""Test that remove_chunk() removes chunk from index and all mappings."""
# Add chunks
chunk1 = Chunk(
chunk_id="doc1#chunk#0",
doc_id="doc1",
content="First chunk content.",
metadata={},
chunk_index=0,
header_path="Section 1",
start_pos=0,
end_pos=20,
file_path="/tmp/doc1.md",
modified_time=datetime.now(),
)
chunk2 = Chunk(
chunk_id="doc1#chunk#1",
doc_id="doc1",
content="Second chunk content.",
metadata={},
chunk_index=1,
header_path="Section 2",
start_pos=21,
end_pos=42,
file_path="/tmp/doc1.md",
modified_time=datetime.now(),
)
vector_index.add_chunk(chunk1)
vector_index.add_chunk(chunk2)
# Verify chunks are indexed
assert "doc1#chunk#0" in vector_index._chunk_id_to_node_id
assert "doc1#chunk#1" in vector_index._chunk_id_to_node_id
assert "doc1" in vector_index._doc_id_to_node_ids
assert len(vector_index._doc_id_to_node_ids["doc1"]) == 2
# Remove one chunk
vector_index.remove_chunk("doc1#chunk#0")
# Verify removal
assert "doc1#chunk#0" not in vector_index._chunk_id_to_node_id
assert "doc1#chunk#1" in vector_index._chunk_id_to_node_id
assert "doc1" in vector_index._doc_id_to_node_ids
assert len(vector_index._doc_id_to_node_ids["doc1"]) == 1
assert "doc1#chunk#1" in vector_index._doc_id_to_node_ids["doc1"]
def test_remove_chunk_handles_missing_chunk(vector_index):
"""Test that remove_chunk() handles missing chunk gracefully."""
# Should not raise exception
vector_index.remove_chunk("nonexistent_chunk_id")
def test_remove_chunk_removes_last_chunk_cleans_doc_mapping(vector_index):
"""Test that removing last chunk of a doc removes the doc from mappings."""
chunk = Chunk(
chunk_id="doc_single#chunk#0",
doc_id="doc_single",
content="Only chunk.",
metadata={},
chunk_index=0,
header_path="",
start_pos=0,
end_pos=11,
file_path="/tmp/single.md",
modified_time=datetime.now(),
)
vector_index.add_chunk(chunk)
assert "doc_single" in vector_index._doc_id_to_node_ids
# Remove the only chunk
vector_index.remove_chunk("doc_single#chunk#0")
# Doc mapping should be removed when last chunk is removed
assert "doc_single" not in vector_index._doc_id_to_node_ids
def test_remove_chunk_thread_safe(vector_index):
"""Test that remove_chunk() is thread-safe with concurrent operations."""
# Add multiple chunks
chunks = []
for i in range(10):
chunk = Chunk(
chunk_id=f"concurrent_doc#chunk#{i}",
doc_id="concurrent_doc",
content=f"Chunk {i} content.",
metadata={},
chunk_index=i,
header_path=f"Section {i}",
start_pos=i * 20,
end_pos=(i + 1) * 20,
file_path="/tmp/concurrent.md",
modified_time=datetime.now(),
)
chunks.append(chunk)
vector_index.add_chunk(chunk)
# Concurrently remove half the chunks
def remove_chunk_task(chunk_id):
vector_index.remove_chunk(chunk_id)
with ThreadPoolExecutor(max_workers=4) as executor:
chunk_ids_to_remove = [f"concurrent_doc#chunk#{i}" for i in range(0, 10, 2)]
list(executor.map(remove_chunk_task, chunk_ids_to_remove))
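        # list() drains the map iterator so any exception raised in a worker
        # thread propagates here instead of being silently discarded.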
# Verify correct chunks were removed
for i in range(10):
chunk_id = f"concurrent_doc#chunk#{i}"
if i % 2 == 0:
assert chunk_id not in vector_index._chunk_id_to_node_id
else:
assert chunk_id in vector_index._chunk_id_to_node_id
def test_remove_chunk_before_initialization(vector_index):
"""Test remove_chunk() when index not initialized."""
# Clear index
vector_index._index = None
# Should log warning but not raise
vector_index.remove_chunk("any_chunk_id")