Markdown RAG Documentation

Overview Schema Related Servers Score Discussions

test_keyword_index.py•31.4 KiB

"""Tests for ``KeywordIndex``: indexing, search, removal, persistence and recovery."""

import threading
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from pathlib import Path

import pytest

from src.indices.keyword import KeywordIndex
from src.models import Chunk, Document


def _extract_chunk_ids(results: list) -> list[str]:
    """
    Return the chunk IDs contained in a search result list.

    The shape is decided by inspecting the first element: a list of strings
    is returned unchanged, anything else is treated as a list of mappings
    that each carry a ``"chunk_id"`` key. An empty input yields ``[]``.
    """
    if not results:
        return []
    if isinstance(results[0], str):
        return results
    return [r["chunk_id"] for r in results]


def _delete_segment_files(index_dir: Path) -> int:
    """
    Delete every ``*.seg`` file directly inside *index_dir*.

    Used by the corruption tests to simulate a damaged on-disk index.
    Returns the number of files removed so callers can assert that there
    was something to corrupt.
    """
    seg_files = list(index_dir.glob("*.seg"))
    for seg in seg_files:
        seg.unlink()
    return len(seg_files)


@pytest.fixture
def sample_document():
    """A single document with a title, two aliases, one link and two tags."""
    return Document(
        id="test-doc",
        content="Machine learning is a subset of artificial intelligence.",
        metadata={"title": "ML Intro", "aliases": ["AI Intro", "ML Basics"]},
        links=["AI"],
        tags=["ml", "ai"],
        file_path="/tmp/test.md",
        modified_time=datetime.now(),
    )


@pytest.fixture
def keyword_index():
    """A fresh ``KeywordIndex`` built with default arguments."""
    return KeywordIndex()


def test_keyword_index_add_and_search(keyword_index, sample_document):
    """An added document is found by a content query and top_k caps the result count."""
    keyword_index.add(sample_document)

    results = keyword_index.search("machine learning", top_k=5)

    assert "test-doc" in _extract_chunk_ids(results)
    assert len(results) <= 5


def test_keyword_index_search_aliases(keyword_index, sample_document):
    """
    A query built from alias terms finds the document.

    NOTE(review): "AI Basics" mixes words from two different aliases
    ("AI Intro", "ML Basics"), so this presumably relies on per-term
    matching rather than exact alias matching - confirm against the index.
    """
    keyword_index.add(sample_document)

    results = keyword_index.search("AI Basics", top_k=5)

    assert "test-doc" in _extract_chunk_ids(results)


def test_keyword_index_search_tags(keyword_index, sample_document):
    """A query equal to one of the document's tags finds the document."""
    keyword_index.add(sample_document)

    results = keyword_index.search("ml", top_k=5)

    assert "test-doc" in _extract_chunk_ids(results)


def test_keyword_index_remove(keyword_index, sample_document):
    """A document is searchable after add() and no longer returned after remove()."""
    keyword_index.add(sample_document)

    results_before = keyword_index.search("machine learning", top_k=5)
    assert "test-doc" in _extract_chunk_ids(results_before)

    keyword_index.remove("test-doc")

    results_after = keyword_index.search("machine learning", top_k=5)
    assert "test-doc" not in _extract_chunk_ids(results_after)


def test_keyword_index_empty_query(keyword_index, sample_document):
    """Empty and whitespace-only queries return an empty list."""
    keyword_index.add(sample_document)

    results = keyword_index.search("", top_k=5)
    assert results == []

    results = keyword_index.search(" ", top_k=5)
    assert results == []


def test_keyword_index_persist_and_load(tmp_path, sample_document):
    """An index persisted to disk can be loaded by a second instance and searched."""
    index1 = KeywordIndex()
    index1.add(sample_document)

    persist_path = tmp_path / "index"
    index1.persist(persist_path)

    assert persist_path.exists()

    index2 = KeywordIndex()
    index2.load(persist_path)

    results = index2.search("machine learning", top_k=5)
    assert "test-doc" in _extract_chunk_ids(results)


def test_keyword_index_multiple_documents(keyword_index):
    """Each of two indexed documents is found by a query for its own term."""
    doc1 = Document(
        id="doc1",
        content="Python is a programming language.",
        metadata={},
        links=[],
        tags=["python"],
        file_path="/tmp/doc1.md",
        modified_time=datetime.now(),
    )
    doc2 = Document(
        id="doc2",
        content="JavaScript is used for web development.",
        metadata={},
        links=[],
        tags=["javascript"],
        file_path="/tmp/doc2.md",
        modified_time=datetime.now(),
    )

    keyword_index.add(doc1)
    keyword_index.add(doc2)

    results = keyword_index.search("python", top_k=5)
    assert "doc1" in _extract_chunk_ids(results)

    results = keyword_index.search("javascript", top_k=5)
    assert "doc2" in _extract_chunk_ids(results)


def test_keyword_index_exact_match_priority(keyword_index):
    """
    The document containing the query term ranks ahead of one that does not.

    doc2 is only compared when it appears in the results at all; a result
    list containing just doc1 also satisfies the test.
    """
    doc1 = Document(
        id="doc1",
        content="BM25 is a ranking function used in information retrieval.",
        metadata={},
        links=[],
        tags=[],
        file_path="/tmp/doc1.md",
        modified_time=datetime.now(),
    )
    doc2 = Document(
        id="doc2",
        content="Information retrieval is important for search engines.",
        metadata={},
        links=[],
        tags=[],
        file_path="/tmp/doc2.md",
        modified_time=datetime.now(),
    )

    keyword_index.add(doc1)
    keyword_index.add(doc2)

    results = keyword_index.search("BM25", top_k=5)
    chunk_ids = _extract_chunk_ids(results)

    assert "doc1" in chunk_ids
    # Membership must be tested on the extracted IDs: the raw results may be
    # dicts, in which case `"doc2" in results` is always False and the
    # ordering check would silently never run.
    if "doc2" in chunk_ids:
        assert chunk_ids.index("doc1") < chunk_ids.index("doc2")


def test_keyword_index_update_document(keyword_index):
    """Re-adding a document under the same id replaces its previously indexed content."""
    doc = Document(
        id="doc1",
        content="Original content about Python.",
        metadata={},
        links=[],
        tags=["python"],
        file_path="/tmp/doc1.md",
        modified_time=datetime.now(),
    )
    keyword_index.add(doc)

    results = keyword_index.search("python", top_k=5)
    assert "doc1" in _extract_chunk_ids(results)

    updated_doc = Document(
        id="doc1",
        content="Updated content about JavaScript.",
        metadata={},
        links=[],
        tags=["javascript"],
        file_path="/tmp/doc1.md",
        modified_time=datetime.now(),
    )
    keyword_index.add(updated_doc)

    results = keyword_index.search("javascript", top_k=5)
    assert "doc1" in _extract_chunk_ids(results)

    # The old content and tag must no longer match.
    results = keyword_index.search("python", top_k=5)
    assert "doc1" not in _extract_chunk_ids(results)


def test_keyword_index_load_nonexistent_path(tmp_path):
    """load() on a path that does not exist leaves the index usable."""
    index = KeywordIndex()
    nonexistent_path = tmp_path / "nonexistent"

    index.load(nonexistent_path)

    # Verify index is functional by adding and searching a document
    doc = Document(
        id="test-doc",
        content="Test content for initialization.",
        metadata={},
        links=[],
        tags=[],
        file_path="/tmp/test.md",
        modified_time=datetime.now(),
    )
    index.add(doc)

    results = index.search("test content", top_k=5)
    assert "test-doc" in _extract_chunk_ids(results)


@pytest.mark.skip(
    reason="Whoosh tokenization normalizes 'C++' to 'c', making exact match impossible. "
    "This is inherent to Whoosh's StandardAnalyzer which strips punctuation. "
    "Would require custom analyzer configuration to preserve such tokens."
)
def test_keyword_index_special_characters(keyword_index):
    """Queries containing punctuation ("C++", "Node.js") find the document (currently skipped)."""
    doc = Document(
        id="special-doc",
        content="C++ is a programming language. Node.js is a runtime.",
        metadata={},
        links=[],
        tags=["c++", "nodejs"],
        file_path="/tmp/special.md",
        modified_time=datetime.now(),
    )
    keyword_index.add(doc)

    results = keyword_index.search("C++", top_k=5)
    assert "special-doc" in _extract_chunk_ids(results)

    results = keyword_index.search("Node.js", top_k=5)
    assert "special-doc" in _extract_chunk_ids(results)


def test_keyword_index_phrase_search(keyword_index):
    """A multi-word query returns the document that contains those words in sequence."""
    doc1 = Document(
        id="doc1",
        content="The quick brown fox jumps over the lazy dog.",
        metadata={},
        links=[],
        tags=[],
        file_path="/tmp/doc1.md",
        modified_time=datetime.now(),
    )
    doc2 = Document(
        id="doc2",
        content="A lazy fox and a quick dog.",
        metadata={},
        links=[],
        tags=[],
        file_path="/tmp/doc2.md",
        modified_time=datetime.now(),
    )

    keyword_index.add(doc1)
    keyword_index.add(doc2)

    results = keyword_index.search("quick brown fox", top_k=5)
    assert "doc1" in _extract_chunk_ids(results)


def test_keyword_index_no_results(keyword_index, sample_document):
    """A query with no matching terms returns an empty list."""
    keyword_index.add(sample_document)

    results = keyword_index.search("quantum physics", top_k=5)

    assert results == []


def test_keyword_index_aliases_as_string(keyword_index):
    """An ``aliases`` metadata value given as a plain string is still searchable."""
    doc = Document(
        id="doc1",
        content="Content about AI.",
        metadata={"aliases": "Artificial Intelligence"},
        links=[],
        tags=[],
        file_path="/tmp/doc1.md",
        modified_time=datetime.now(),
    )
    keyword_index.add(doc)

    results = keyword_index.search("Artificial Intelligence", top_k=5)
    assert "doc1" in _extract_chunk_ids(results)


def test_keyword_index_no_aliases(keyword_index):
    """A document with no ``aliases`` metadata is indexed and found by its content."""
    doc = Document(
        id="doc1",
        content="Content without aliases.",
        metadata={},
        links=[],
        tags=[],
        file_path="/tmp/doc1.md",
        modified_time=datetime.now(),
    )
    keyword_index.add(doc)

    results = keyword_index.search("content", top_k=5)
    assert "doc1" in _extract_chunk_ids(results)


def test_keyword_index_concurrent_access(keyword_index):
    """Two documents added from separate threads are both searchable afterwards."""
    doc1 = Document(
        id="doc1",
        content="First document.",
        metadata={},
        links=[],
        tags=[],
        file_path="/tmp/doc1.md",
        modified_time=datetime.now(),
    )
    doc2 = Document(
        id="doc2",
        content="Second document.",
        metadata={},
        links=[],
        tags=[],
        file_path="/tmp/doc2.md",
        modified_time=datetime.now(),
    )

    def add_doc1():
        keyword_index.add(doc1)

    def add_doc2():
        keyword_index.add(doc2)

    thread1 = threading.Thread(target=add_doc1)
    thread2 = threading.Thread(target=add_doc2)

    thread1.start()
    thread2.start()
    thread1.join()
    thread2.join()

    results = keyword_index.search("document", top_k=5)
    chunk_ids = _extract_chunk_ids(results)
    assert "doc1" in chunk_ids
    assert "doc2" in chunk_ids


def test_keyword_index_empty_content(keyword_index):
    """
    Validates graceful handling of documents with empty content.

    Prevents indexing crashes on placeholder or metadata-only files.
    """
    doc = Document(
        id="empty-doc",
        content="",
        metadata={"title": "Empty File"},
        links=[],
        tags=["empty"],
        file_path="/tmp/empty.md",
        modified_time=datetime.now(),
    )
    keyword_index.add(doc)

    results = keyword_index.search("empty", top_k=5)
    assert "empty-doc" in _extract_chunk_ids(results)


def test_keyword_index_very_large_document(keyword_index):
    """
    Tests indexing of very large documents (>10k characters).

    Ensures Whoosh can handle large content without performance degradation or errors.
    """
    large_content = " ".join(
        [f"This is sentence number {i} in a very long document." for i in range(200)]
    )
    # Guard the premise of the test: the generated text really is > 10k chars.
    assert len(large_content) > 10000

    doc = Document(
        id="large-doc",
        content=large_content,
        metadata={},
        links=[],
        tags=["large"],
        file_path="/tmp/large.md",
        modified_time=datetime.now(),
    )
    keyword_index.add(doc)

    results = keyword_index.search("sentence number 42", top_k=5)
    assert "large-doc" in _extract_chunk_ids(results)

    results = keyword_index.search("large", top_k=5)
    assert "large-doc" in _extract_chunk_ids(results)


# ============================================================================
# BM25F Field Boosting Tests (Phase 1 Search Quality)
# ============================================================================


def test_keyword_index_title_field_boosted():
    """
    Title field is indexed with boost factor 3.0.

    Verifies P4: Title field has highest boost and matches rank higher.
    """
    keyword_index = KeywordIndex()

    # Chunk with search term in title
    chunk_with_title = Chunk(
        chunk_id="titled_chunk_0",
        doc_id="titled-doc",
        content="Some generic content about programming.",
        metadata={"title": "Authentication Guide", "tags": []},
        chunk_index=0,
        header_path="",
        start_pos=0,
        end_pos=50,
        file_path="/tmp/auth.md",
        modified_time=datetime.now(),
    )

    # Chunk with search term in content only
    chunk_content_only = Chunk(
        chunk_id="content_chunk_0",
        doc_id="content-doc",
        content="This document covers authentication patterns and best practices.",
        metadata={"title": "Generic Document", "tags": []},
        chunk_index=0,
        header_path="",
        start_pos=0,
        end_pos=70,
        file_path="/tmp/generic.md",
        modified_time=datetime.now(),
    )

    keyword_index.add_chunk(chunk_with_title)
    keyword_index.add_chunk(chunk_content_only)

    results = keyword_index.search("authentication", top_k=5)
    chunk_ids = _extract_chunk_ids(results)

    assert "titled_chunk_0" in chunk_ids
    assert "content_chunk_0" in chunk_ids
    # Title match should rank higher due to 3.0 boost
    assert chunk_ids.index("titled_chunk_0") < chunk_ids.index("content_chunk_0")


def test_keyword_index_headers_field_indexed():
    """
    Headers field is indexed with boost factor 2.5.

    Verifies header_path is searchable in keyword index.
    """
    keyword_index = KeywordIndex()

    chunk = Chunk(
        chunk_id="header_chunk_0",
        doc_id="header-doc",
        content="Implementation details for the feature.",
        metadata={"tags": []},
        chunk_index=0,
        header_path="API Reference > Endpoints > User Management",
        start_pos=0,
        end_pos=50,
        file_path="/tmp/api.md",
        modified_time=datetime.now(),
    )
    keyword_index.add_chunk(chunk)

    # Search for terms in header_path
    results = keyword_index.search("API endpoints", top_k=5)
    assert "header_chunk_0" in _extract_chunk_ids(results)

    results = keyword_index.search("user management", top_k=5)
    assert "header_chunk_0" in _extract_chunk_ids(results)


def test_keyword_index_keywords_field_indexed():
    """
    Keywords field is indexed with boost factor 2.5.

    Verifies frontmatter keywords are searchable.
    """
    keyword_index = KeywordIndex()

    chunk = Chunk(
        chunk_id="kw_chunk_0",
        doc_id="kw-doc",
        content="General content without specific terms.",
        metadata={
            "keywords": ["microservices", "distributed-systems", "scalability"],
            "tags": [],
        },
        chunk_index=0,
        header_path="",
        start_pos=0,
        end_pos=50,
        file_path="/tmp/arch.md",
        modified_time=datetime.now(),
    )
    keyword_index.add_chunk(chunk)

    # Search for keywords
    results = keyword_index.search("microservices", top_k=5)
    assert "kw_chunk_0" in _extract_chunk_ids(results)

    results = keyword_index.search("distributed systems", top_k=5)
    assert "kw_chunk_0" in _extract_chunk_ids(results)


def test_keyword_index_description_field_indexed():
    """
    Description field is indexed with boost factor 2.0.

    Verifies frontmatter description is searchable.
    """
    keyword_index = KeywordIndex()

    chunk = Chunk(
        chunk_id="desc_chunk_0",
        doc_id="desc-doc",
        content="Code examples and snippets.",
        metadata={
            "description": "A comprehensive guide to containerization with Docker",
            "tags": [],
        },
        chunk_index=0,
        header_path="",
        start_pos=0,
        end_pos=30,
        file_path="/tmp/docker.md",
        modified_time=datetime.now(),
    )
    keyword_index.add_chunk(chunk)

    # Search for terms in description
    results = keyword_index.search("containerization Docker", top_k=5)
    assert "desc_chunk_0" in _extract_chunk_ids(results)


def test_keyword_index_author_field_indexed():
    """
    Author field is indexed for searchability.

    Verifies documents can be found by author name.
    """
    keyword_index = KeywordIndex()

    chunk = Chunk(
        chunk_id="author_chunk_0",
        doc_id="author-doc",
        content="Technical documentation content.",
        metadata={"author": "John Smith", "tags": []},
        chunk_index=0,
        header_path="",
        start_pos=0,
        end_pos=40,
        file_path="/tmp/authored.md",
        modified_time=datetime.now(),
    )
    keyword_index.add_chunk(chunk)

    # Search by author
    results = keyword_index.search("John Smith", top_k=5)
    assert "author_chunk_0" in _extract_chunk_ids(results)


def test_keyword_index_category_field_indexed():
    """
    Category field is indexed as KEYWORD type.

    Verifies documents can be filtered/searched by category.
    """
    keyword_index = KeywordIndex()

    chunk = Chunk(
        chunk_id="cat_chunk_0",
        doc_id="cat-doc",
        content="Tutorial content here.",
        metadata={"category": "tutorials", "tags": []},
        chunk_index=0,
        header_path="",
        start_pos=0,
        end_pos=25,
        file_path="/tmp/tutorial.md",
        modified_time=datetime.now(),
    )
    keyword_index.add_chunk(chunk)

    # Category is a KEYWORD field, exact match search
    results = keyword_index.search("tutorials", top_k=5)
    assert "cat_chunk_0" in _extract_chunk_ids(results)


def test_keyword_index_all_boosted_fields_together():
    """
    All boosted fields work together for comprehensive search.

    Verifies multiple frontmatter fields are indexed and searchable.
    """
    keyword_index = KeywordIndex()

    chunk = Chunk(
        chunk_id="full_chunk_0",
        doc_id="full-doc",
        content="Main content of the document.",
        metadata={
            "title": "Kubernetes Deployment Guide",
            "description": "Step-by-step instructions for deploying applications",
            "keywords": ["k8s", "containers", "orchestration"],
            "author": "DevOps Team",
            "category": "infrastructure",
            "aliases": ["k8s-guide", "deployment-howto"],
            "tags": ["kubernetes", "devops"],
        },
        chunk_index=0,
        header_path="Getting Started > Prerequisites",
        start_pos=0,
        end_pos=35,
        file_path="/tmp/k8s.md",
        modified_time=datetime.now(),
    )
    keyword_index.add_chunk(chunk)

    # Search various fields
    assert "full_chunk_0" in _extract_chunk_ids(
        keyword_index.search("Kubernetes", top_k=5)
    )
    assert "full_chunk_0" in _extract_chunk_ids(
        keyword_index.search("deploying applications", top_k=5)
    )
    assert "full_chunk_0" in _extract_chunk_ids(
        keyword_index.search("k8s containers", top_k=5)
    )
    assert "full_chunk_0" in _extract_chunk_ids(
        keyword_index.search("DevOps Team", top_k=5)
    )
    assert "full_chunk_0" in _extract_chunk_ids(
        keyword_index.search("prerequisites", top_k=5)
    )
    assert "full_chunk_0" in _extract_chunk_ids(
        keyword_index.search("k8s-guide", top_k=5)
    )


def test_keyword_index_schema_mismatch_triggers_rebuild(tmp_path):
    """
    Loading an index with mismatched schema triggers a rebuild.

    When the persisted index has a different schema than expected (e.g., missing
    new fields), the index should be rebuilt from scratch to avoid field errors.
    """
    # Imported locally: whoosh is only needed here, to hand-craft a legacy index.
    from whoosh import index as whoosh_index
    from whoosh.fields import ID, TEXT, Schema

    old_schema = Schema(
        id=ID(stored=True, unique=True),
        doc_id=ID(stored=True),
        content=TEXT(stored=False),
        aliases=TEXT(stored=False),
        tags=TEXT(stored=False),
    )

    index_path = tmp_path / "old_keyword_index"
    index_path.mkdir()
    whoosh_index.create_in(str(index_path), old_schema)

    keyword_index = KeywordIndex()
    keyword_index.load(index_path)

    chunk = Chunk(
        chunk_id="new_chunk_0",
        doc_id="new-doc",
        content="Test content.",
        metadata={"author": "Test Author", "tags": []},
        chunk_index=0,
        header_path="",
        start_pos=0,
        end_pos=13,
        file_path="/tmp/test.md",
        modified_time=datetime.now(),
    )
    keyword_index.add_chunk(chunk)

    # "author" is absent from old_schema, so finding it proves the schema was replaced.
    results = keyword_index.search("Test Author", top_k=5)
    assert "new_chunk_0" in _extract_chunk_ids(results)


def test_keyword_index_remove_handles_corrupted_segment(tmp_path):
    """
    Remove operation handles corrupted segment files gracefully.

    When Whoosh segment files (.seg) are deleted/corrupted mid-operation,
    the index should detect the corruption, reinitialize, and not crash.
    """
    keyword_index = KeywordIndex()

    chunk = Chunk(
        chunk_id="chunk_to_remove_0",
        doc_id="test-doc",
        content="Content for removal testing.",
        metadata={"tags": []},
        chunk_index=0,
        header_path="",
        start_pos=0,
        end_pos=30,
        file_path="/tmp/test.md",
        modified_time=datetime.now(),
    )
    keyword_index.add_chunk(chunk)

    index_path = tmp_path / "corrupted_keyword_index"
    keyword_index.persist(index_path)
    keyword_index.load(index_path)

    assert _delete_segment_files(index_path) > 0, "Expected segment files after persist"

    keyword_index.remove("chunk_to_remove_0")

    results = keyword_index.search("removal testing", top_k=5)
    assert isinstance(results, list)


def test_keyword_index_search_handles_corrupted_segment(tmp_path):
    """
    Search operation handles corrupted segment files gracefully.

    When Whoosh segment files (.seg) are corrupted, search should detect
    the issue, reinitialize the index, and return an empty list rather than crashing.
    """
    keyword_index = KeywordIndex()

    chunk = Chunk(
        chunk_id="search_chunk_0",
        doc_id="search-doc",
        content="Searchable content for testing.",
        metadata={"tags": []},
        chunk_index=0,
        header_path="",
        start_pos=0,
        end_pos=35,
        file_path="/tmp/test.md",
        modified_time=datetime.now(),
    )
    keyword_index.add_chunk(chunk)

    index_path = tmp_path / "corrupted_search_index"
    keyword_index.persist(index_path)
    keyword_index.load(index_path)

    assert _delete_segment_files(index_path) > 0, "Expected segment files after persist"

    results = keyword_index.search("searchable content", top_k=5)
    assert results == []


def test_keyword_index_recovery_allows_reindexing(tmp_path):
    """
    After corruption recovery, new documents can be indexed successfully.

    This tests the full cycle: create index, persist, corrupt, detect corruption
    during operation, reinitialize, then add new documents successfully.
    """
    keyword_index = KeywordIndex()

    original_chunk = Chunk(
        chunk_id="original_0",
        doc_id="original-doc",
        content="Original content before corruption.",
        metadata={"tags": []},
        chunk_index=0,
        header_path="",
        start_pos=0,
        end_pos=40,
        file_path="/tmp/original.md",
        modified_time=datetime.now(),
    )
    keyword_index.add_chunk(original_chunk)

    index_path = tmp_path / "recovery_test_index"
    keyword_index.persist(index_path)
    keyword_index.load(index_path)

    _delete_segment_files(index_path)

    # The result is irrelevant; this call exists only to trip corruption handling.
    keyword_index.search("trigger corruption detection", top_k=5)

    new_chunk = Chunk(
        chunk_id="new_after_recovery_0",
        doc_id="new-doc",
        content="New content added after recovery.",
        metadata={"tags": []},
        chunk_index=0,
        header_path="",
        start_pos=0,
        end_pos=35,
        file_path="/tmp/new.md",
        modified_time=datetime.now(),
    )
    keyword_index.add_chunk(new_chunk)

    results = keyword_index.search("new content recovery", top_k=5)
    assert "new_after_recovery_0" in _extract_chunk_ids(results)


# ============================================================================
# Chunk Removal Tests (Phase 2 Delta Indexing)
# ============================================================================


def test_remove_chunk_removes_from_index():
    """Test that remove_chunk() removes specific chunk from keyword index."""
    keyword_index = KeywordIndex()

    # Add two chunks
    chunk1 = Chunk(
        chunk_id="doc1#chunk#0",
        doc_id="doc1",
        content="First chunk with Python programming.",
        metadata={},
        chunk_index=0,
        header_path="Section 1",
        start_pos=0,
        end_pos=36,
        file_path="/tmp/doc1.md",
        modified_time=datetime.now(),
    )
    chunk2 = Chunk(
        chunk_id="doc1#chunk#1",
        doc_id="doc1",
        content="Second chunk with Java programming.",
        metadata={},
        chunk_index=1,
        header_path="Section 2",
        start_pos=37,
        end_pos=72,
        file_path="/tmp/doc1.md",
        modified_time=datetime.now(),
    )

    keyword_index.add_chunk(chunk1)
    keyword_index.add_chunk(chunk2)

    # Both chunks should be searchable
    results = keyword_index.search("Python", top_k=5)
    assert "doc1#chunk#0" in _extract_chunk_ids(results)

    results = keyword_index.search("Java", top_k=5)
    assert "doc1#chunk#1" in _extract_chunk_ids(results)

    # Remove first chunk
    keyword_index.remove_chunk("doc1#chunk#0")

    # First chunk should not be found
    results = keyword_index.search("Python", top_k=5)
    assert "doc1#chunk#0" not in _extract_chunk_ids(results)

    # Second chunk should still be found
    results = keyword_index.search("Java", top_k=5)
    assert "doc1#chunk#1" in _extract_chunk_ids(results)


def test_remove_chunk_handles_missing_chunk():
    """Test that remove_chunk() handles missing chunk gracefully."""
    keyword_index = KeywordIndex()

    # Should not raise exception
    keyword_index.remove_chunk("nonexistent_chunk_id")


def test_remove_chunk_handles_corruption(tmp_path):
    """Test that remove_chunk() handles index corruption gracefully."""
    keyword_index = KeywordIndex()

    chunk = Chunk(
        chunk_id="corrupt_test#chunk#0",
        doc_id="corrupt_test",
        content="Content for corruption test.",
        metadata={},
        chunk_index=0,
        header_path="",
        start_pos=0,
        end_pos=29,
        file_path="/tmp/corrupt.md",
        modified_time=datetime.now(),
    )
    keyword_index.add_chunk(chunk)

    # Persist and corrupt
    persist_path = tmp_path / "corrupt_index"
    keyword_index.persist(persist_path)
    keyword_index.load(persist_path)

    # Delete segment files to simulate corruption
    _delete_segment_files(persist_path)

    # Should handle corruption gracefully
    keyword_index.remove_chunk("corrupt_test#chunk#0")

    # Index should be reinitialized and functional
    new_chunk = Chunk(
        chunk_id="new_chunk#0",
        doc_id="new_doc",
        content="New content after recovery.",
        metadata={},
        chunk_index=0,
        header_path="",
        start_pos=0,
        end_pos=27,
        file_path="/tmp/new.md",
        modified_time=datetime.now(),
    )
    keyword_index.add_chunk(new_chunk)

    results = keyword_index.search("new content", top_k=5)
    assert "new_chunk#0" in _extract_chunk_ids(results)


def test_remove_chunk_before_initialization():
    """Test remove_chunk() when index not initialized."""
    keyword_index = KeywordIndex()
    # Force the uninitialized state by clearing the private handle directly.
    keyword_index._index = None

    # Should log warning but not raise
    keyword_index.remove_chunk("any_chunk_id")


def test_remove_chunk_thread_safe():
    """Test that remove_chunk() is thread-safe with concurrent operations."""
    keyword_index = KeywordIndex()

    # Add multiple chunks
    for i in range(10):
        chunk = Chunk(
            chunk_id=f"concurrent#chunk#{i}",
            doc_id="concurrent",
            content=f"Chunk {i} with unique term{i}.",
            metadata={},
            chunk_index=i,
            header_path=f"Section {i}",
            start_pos=i * 30,
            end_pos=(i + 1) * 30,
            file_path="/tmp/concurrent.md",
            modified_time=datetime.now(),
        )
        keyword_index.add_chunk(chunk)

    # Concurrently remove half the chunks
    def remove_chunk_task(chunk_id):
        keyword_index.remove_chunk(chunk_id)

    with ThreadPoolExecutor(max_workers=4) as executor:
        chunk_ids_to_remove = [f"concurrent#chunk#{i}" for i in range(0, 10, 2)]
        # Consuming the iterator re-raises any exception from a worker thread.
        list(executor.map(remove_chunk_task, chunk_ids_to_remove))

    # Verify removed chunks are not found
    for i in range(0, 10, 2):
        results = keyword_index.search(f"term{i}", top_k=5)
        assert f"concurrent#chunk#{i}" not in _extract_chunk_ids(results)

    # Verify remaining chunks are still found
    for i in range(1, 10, 2):
        results = keyword_index.search(f"term{i}", top_k=5)
        assert f"concurrent#chunk#{i}" in _extract_chunk_ids(results)


# ============================================================================
# Missing MAIN Index Regression Tests (Issue: whoosh.index.IndexError)
# ============================================================================


def test_load_handles_missing_main_index(tmp_path):
    """
    load() gracefully handles directory with missing MAIN index segment.

    When the index directory exists but lacks valid index files (e.g., only
    contains partial files or is empty), whoosh raises IndexError with message
    "Index 'MAIN' does not exist". The index should reinitialize rather than crash.
    """
    index_dir = tmp_path / "incomplete_index"
    index_dir.mkdir()

    # Create an incomplete index structure (directory exists but no MAIN segment)
    (index_dir / "WRITELOCK").touch()

    keyword_index = KeywordIndex()
    keyword_index.load(index_dir)

    # Should have reinitialized - verify by adding and searching
    doc = Document(
        id="test-doc",
        content="Test content after recovery from missing MAIN.",
        metadata={},
        links=[],
        tags=[],
        file_path="/tmp/test.md",
        modified_time=datetime.now(),
    )
    keyword_index.add(doc)

    results = keyword_index.search("test content", top_k=5)
    assert "test-doc" in _extract_chunk_ids(results)


def test_load_from_handles_missing_main_index(tmp_path):
    """
    load_from() returns False for directory with missing MAIN index segment.

    This tests the snapshot loading path where a directory exists but lacks
    valid index files. Should return False to signal snapshot is unusable.
    """
    index_dir = tmp_path / "incomplete_snapshot"
    index_dir.mkdir()

    # Create an incomplete index structure
    (index_dir / "WRITELOCK").touch()

    keyword_index = KeywordIndex()
    result = keyword_index.load_from(index_dir)

    assert result is False


def test_load_handles_empty_directory(tmp_path):
    """
    load() handles completely empty directory that passes exists() check.

    Edge case where directory exists but is completely empty - no index files at all.
    """
    empty_dir = tmp_path / "empty_index"
    empty_dir.mkdir()

    keyword_index = KeywordIndex()
    keyword_index.load(empty_dir)

    # Should work normally after reinitialization
    doc = Document(
        id="empty-recovery",
        content="Content indexed after empty directory recovery.",
        metadata={},
        links=[],
        tags=[],
        file_path="/tmp/test.md",
        modified_time=datetime.now(),
    )
    keyword_index.add(doc)

    results = keyword_index.search("empty directory recovery", top_k=5)
    assert "empty-recovery" in _extract_chunk_ids(results)


def test_load_from_handles_empty_directory(tmp_path):
    """
    load_from() returns False for completely empty directory.
    """
    empty_dir = tmp_path / "empty_snapshot"
    empty_dir.mkdir()

    keyword_index = KeywordIndex()
    result = keyword_index.load_from(empty_dir)

    assert result is False

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/andnp/ragdocs-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test_keyword_index.py•31.4 KiB