Skip to main content
Glama
juanqui
by juanqui
test_deduplication.py (6.86 kB)
"""Tests for deduplication functionality.""" from pathlib import Path from unittest.mock import Mock import pytest from src.pdfkb.config import ServerConfig from src.pdfkb.models import Chunk, Document from src.pdfkb.vector_store import VectorStore class TestChunkDeduplication: """Test cases for chunk deduplication logic.""" @pytest.fixture def config(self): """Create test configuration.""" return ServerConfig( openai_api_key="sk-test-key", knowledgebase_path=Path("./test_pdfs"), cache_dir=Path("./test_cache"), chunk_size=500, chunk_overlap=50, embedding_model="text-embedding-3-small", ) @pytest.fixture def sample_document(self): """Create a sample document with chunks.""" doc = Document( path="/test/sample.pdf", title="Test Document", checksum="abc123", file_size=1000, page_count=2, ) # Add chunks with deterministic content for testing chunk1 = Chunk( document_id=doc.id, text="This is the first chunk of text.", chunk_index=0, page_number=1, embedding=[0.1, 0.2, 0.3], ) chunk2 = Chunk( document_id=doc.id, text="This is the second chunk of text.", chunk_index=1, page_number=1, embedding=[0.4, 0.5, 0.6], ) doc.add_chunk(chunk1) doc.add_chunk(chunk2) return doc def test_chunk_deterministic_id_generation(self, sample_document): """Test that chunks generate deterministic IDs based on content.""" doc = sample_document chunk1, chunk2 = doc.chunks # IDs should be deterministic and different assert chunk1.id.startswith("chunk_") assert chunk2.id.startswith("chunk_") assert chunk1.id != chunk2.id # Same content should produce same ID duplicate_chunk = Chunk( document_id=doc.id, text="This is the first chunk of text.", chunk_index=0, page_number=1, embedding=[0.7, 0.8, 0.9], ) assert duplicate_chunk.id == chunk1.id def test_document_deterministic_id_generation(self): """Test that documents generate deterministic IDs based on path and checksum.""" doc1 = Document(path="/test/sample.pdf", checksum="abc123") doc2 = Document(path="/test/sample.pdf", checksum="abc123") doc3 = 
Document(path="/test/different.pdf", checksum="abc123") # Same path and checksum should produce same ID assert doc1.id == doc2.id assert doc1.id.startswith("doc_") # Different path should produce different ID assert doc1.id != doc3.id @pytest.mark.asyncio async def test_vector_store_filter_existing_chunks(self, config): """Test that vector store correctly filters out existing chunks.""" vector_store = VectorStore(config) # Mock the collection mock_collection = Mock() mock_collection.get.return_value = {"ids": ["chunk_existing1", "chunk_existing2"]} vector_store.collection = mock_collection # Create test chunks existing_chunk = Chunk(id="chunk_existing1", text="Existing chunk", embedding=[0.1, 0.2, 0.3]) new_chunk = Chunk(id="chunk_new1", text="New chunk", embedding=[0.4, 0.5, 0.6]) chunks = [existing_chunk, new_chunk] # Filter chunks new_chunks = await vector_store._filter_existing_chunks(chunks) # Should only return the new chunk assert len(new_chunks) == 1 assert new_chunks[0].id == "chunk_new1" # Verify collection.get was called with correct IDs mock_collection.get.assert_called_once_with(ids=["chunk_existing1", "chunk_new1"], include=["metadatas"]) @pytest.mark.asyncio async def test_add_document_with_duplicates(self, config, sample_document): """Test adding document with some duplicate chunks.""" vector_store = VectorStore(config) # Mock the collection mock_collection = Mock() mock_collection.get.return_value = {"ids": [sample_document.chunks[0].id]} # First chunk already exists mock_collection.add = Mock() vector_store.collection = mock_collection # Add document await vector_store.add_document(sample_document) # Should only add the second chunk (first is duplicate) mock_collection.add.assert_called_once() call_args = mock_collection.add.call_args[1] assert len(call_args["ids"]) == 1 assert call_args["ids"][0] == sample_document.chunks[1].id @pytest.mark.asyncio async def test_add_document_all_duplicates(self, config, sample_document): """Test adding document 
where all chunks are duplicates.""" vector_store = VectorStore(config) # Mock the collection - all chunks exist mock_collection = Mock() mock_collection.get.return_value = {"ids": [chunk.id for chunk in sample_document.chunks]} mock_collection.add = Mock() vector_store.collection = mock_collection # Add document await vector_store.add_document(sample_document) # Should not add any chunks mock_collection.add.assert_not_called() def test_chunk_content_changes_affect_id(self): """Test that changes to chunk content result in different IDs.""" base_chunk = Chunk(document_id="doc_123", text="Original text", chunk_index=0, page_number=1) modified_text_chunk = Chunk(document_id="doc_123", text="Modified text", chunk_index=0, page_number=1) modified_index_chunk = Chunk( document_id="doc_123", text="Original text", chunk_index=1, # Different index page_number=1, ) # Different content should produce different IDs assert base_chunk.id != modified_text_chunk.id assert base_chunk.id != modified_index_chunk.id assert modified_text_chunk.id != modified_index_chunk.id class TestDuplicateChunkScenarios: """Test various real-world duplicate chunk scenarios.""" @pytest.mark.asyncio async def test_reprocessing_same_file(self): """Test that reprocessing the same file doesn't create duplicates.""" # This would be tested with integration tests # where we process the same PDF twice and verify no duplicates pass @pytest.mark.asyncio async def test_file_monitor_and_manual_add_coordination(self): """Test that file monitor and manual add_document don't create duplicates.""" # This would test the coordination between different processing paths pass

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/juanqui/pdfkb-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.