test_text_index.py
"""Unit tests for TextIndex class.""" import asyncio import tempfile from pathlib import Path from unittest.mock import MagicMock import pytest from whoosh import index from pdfkb.config import ServerConfig from pdfkb.models import Chunk, Document from pdfkb.text_index import TextIndex @pytest.fixture def temp_dir(): """Create a temporary directory for test indexes.""" with tempfile.TemporaryDirectory() as tmpdir: yield Path(tmpdir) @pytest.fixture def test_config(temp_dir): """Create a test configuration.""" config = MagicMock(spec=ServerConfig) config.whoosh_index_dir = str(temp_dir / "whoosh") config.whoosh_analyzer = "standard" config.whoosh_min_score = 0.0 config.enable_hybrid_search = True return config @pytest.fixture def text_index(test_config): """Create a TextIndex instance.""" return TextIndex(test_config) @pytest.fixture def sample_document(): """Create a sample document with chunks.""" doc = Document(id="doc_test123", path="/test/document.pdf", title="Test Document", checksum="abc123") chunks = [ Chunk( id="chunk_001", document_id=doc.id, text="This is the first chunk about machine learning and AI.", page_number=1, chunk_index=0, ), Chunk( id="chunk_002", document_id=doc.id, text="The second chunk discusses neural networks and deep learning.", page_number=1, chunk_index=1, ), Chunk( id="chunk_003", document_id=doc.id, text="Python is a great language for data science.", page_number=2, chunk_index=2, ), ] doc.chunks = chunks return doc @pytest.mark.asyncio @pytest.mark.unit class TestTextIndex: """Test suite for TextIndex class.""" async def test_initialization(self, text_index, temp_dir): """Test TextIndex initialization.""" await text_index.initialize() # Check that index was created assert text_index.index is not None assert Path(text_index.index_dir).exists() assert index.exists_in(str(text_index.index_dir)) async def test_schema_creation(self, text_index): """Test schema creation with correct fields.""" schema = text_index._create_schema() # Check schema fields assert "chunk_id" in schema assert "document_id" in schema assert "text" in schema assert "metadata" in schema assert "page_number" in schema assert "chunk_index" in schema async def test_add_document(self, text_index, sample_document): """Test adding a document to the index.""" await text_index.initialize() await text_index.add_document(sample_document) # Verify chunks were added with text_index.index.searcher() as searcher: assert searcher.doc_count_all() == 3 # Check specific chunk doc = searcher.document(chunk_id="chunk_001") assert doc is not None assert doc["document_id"] == sample_document.id assert "machine learning" in doc["text"] async def test_add_document_empty_chunks(self, text_index): """Test adding a document with no chunks.""" doc = Document(id="empty_doc", path="/test/empty.pdf", checksum="xyz") await text_index.initialize() await text_index.add_document(doc) # Should handle gracefully with text_index.index.searcher() as searcher: assert searcher.doc_count_all() == 0 async def test_search_basic(self, text_index, sample_document): """Test basic search functionality.""" await text_index.initialize() await text_index.add_document(sample_document) # Search for a term results = await text_index.search("machine learning", limit=5) assert len(results) > 0 assert results[0]["chunk_id"] == "chunk_001" assert "machine learning" in results[0]["text"].lower() assert results[0]["score"] > 0 async def test_search_multiple_results(self, text_index, sample_document): """Test search returning multiple results.""" 
await text_index.initialize() await text_index.add_document(sample_document) # Search for a common term results = await text_index.search("learning", limit=10) # Should find at least 2 chunks assert len(results) >= 2 # Results should be sorted by score for i in range(1, len(results)): assert results[i - 1]["score"] >= results[i]["score"] async def test_search_no_results(self, text_index, sample_document): """Test search with no matching results.""" await text_index.initialize() await text_index.add_document(sample_document) # Search for non-existent term results = await text_index.search("quantum computing blockchain", limit=5) assert len(results) == 0 async def test_search_with_limit(self, text_index, sample_document): """Test search respects limit parameter.""" await text_index.initialize() await text_index.add_document(sample_document) # Add more documents for i in range(5): doc = Document(id=f"doc_{i}", path=f"/test/doc_{i}.pdf", checksum=f"hash_{i}") doc.chunks = [ Chunk( id=f"chunk_{i}_{j}", document_id=doc.id, text=f"This is about learning and education in document {i}", page_number=1, chunk_index=j, ) for j in range(3) ] await text_index.add_document(doc) # Search with limit results = await text_index.search("learning", limit=3) assert len(results) <= 3 async def test_delete_document(self, text_index, sample_document): """Test deleting a document from the index.""" await text_index.initialize() await text_index.add_document(sample_document) # Verify document exists with text_index.index.searcher() as searcher: # Count actual documents (not deleted) initial_docs = list(searcher.documents(document_id=sample_document.id)) assert len(initial_docs) == 3 # Delete document await text_index.delete_document(sample_document.id) # Verify deletion - check that documents are no longer returned with text_index.index.searcher() as searcher: # Check no documents with this ID are returned remaining_docs = list(searcher.documents(document_id=sample_document.id)) assert len(remaining_docs) == 0 # Check specific chunk is gone doc = searcher.document(chunk_id="chunk_001") assert doc is None async def test_get_document_count(self, text_index): """Test counting unique documents.""" await text_index.initialize() # Add multiple documents for i in range(3): doc = Document(id=f"doc_{i}", path=f"/test/doc_{i}.pdf", checksum=f"hash_{i}") doc.chunks = [ Chunk(id=f"chunk_{i}_{j}", document_id=doc.id, text=f"Content {j}", chunk_index=j) for j in range(2) ] await text_index.add_document(doc) count = await text_index.get_document_count() assert count == 3 async def test_get_chunk_count(self, text_index): """Test counting total chunks.""" await text_index.initialize() # Add documents with multiple chunks for i in range(2): doc = Document(id=f"doc_{i}", path=f"/test/doc_{i}.pdf", checksum=f"hash_{i}") doc.chunks = [ Chunk(id=f"chunk_{i}_{j}", document_id=doc.id, text=f"Content {j}", chunk_index=j) for j in range(3) ] await text_index.add_document(doc) count = await text_index.get_chunk_count() assert count == 6 # 2 documents * 3 chunks each async def test_reset_index(self, text_index, sample_document): """Test resetting the index.""" await text_index.initialize() await text_index.add_document(sample_document) # Verify data exists with text_index.index.searcher() as searcher: assert searcher.doc_count_all() > 0 # Reset index await text_index.reset_index() # Verify index is empty with text_index.index.searcher() as searcher: assert searcher.doc_count_all() == 0 async def test_update_existing_chunk(self, text_index, 
sample_document): """Test updating an existing chunk.""" await text_index.initialize() await text_index.add_document(sample_document) # Modify and re-add the same document sample_document.chunks[0].text = "Updated text for the first chunk" await text_index.add_document(sample_document) # Verify update with text_index.index.searcher() as searcher: # Count documents for this document_id (should still be 3) docs = list(searcher.documents(document_id=sample_document.id)) assert len(docs) == 3 # Check updated content doc = searcher.document(chunk_id="chunk_001") assert doc is not None assert "Updated text" in doc["text"] async def test_concurrent_writes(self, text_index): """Test concurrent write operations are properly synchronized.""" await text_index.initialize() # Create multiple documents docs = [] for i in range(5): doc = Document(id=f"doc_{i}", path=f"/test/doc_{i}.pdf", checksum=f"hash_{i}") doc.chunks = [Chunk(id=f"chunk_{i}", document_id=doc.id, text=f"Content for document {i}", chunk_index=0)] docs.append(doc) # Add documents concurrently tasks = [text_index.add_document(doc) for doc in docs] await asyncio.gather(*tasks) # Verify all documents were added with text_index.index.searcher() as searcher: assert searcher.doc_count_all() == 5 async def test_stemming_analyzer(self, temp_dir): """Test using stemming analyzer.""" config = MagicMock(spec=ServerConfig) config.whoosh_index_dir = str(temp_dir / "whoosh_stemming") config.whoosh_analyzer = "stemming" config.whoosh_min_score = 0.0 index = TextIndex(config) await index.initialize() # Add document with related terms doc = Document(id="doc_1", path="/test.pdf", checksum="abc") doc.chunks = [Chunk(id="chunk_1", document_id=doc.id, text="running runners run", chunk_index=0)] await index.add_document(doc) # Search should find stemmed variations results = await index.search("run", limit=5) assert len(results) > 0 async def test_minimum_score_threshold(self, test_config, sample_document): """Test minimum score threshold filtering.""" test_config.whoosh_min_score = 0.5 index = TextIndex(test_config) await index.initialize() await index.add_document(sample_document) # Search with high threshold results = await index.search("the", limit=10) # Common words should have lower scores and be filtered for result in results: assert result["score"] >= 0.5 async def test_error_handling_search(self, text_index): """Test error handling during search.""" # Search without initialization should handle gracefully results = await text_index.search("test", limit=5) assert results == [] async def test_close_index(self, text_index): """Test closing the index.""" await text_index.initialize() assert text_index.index is not None await text_index.close() assert text_index.index is None
