Nextcloud MCP Server

by cbcoutinho
test_semantic_search.py (13.1 kB)
"""Integration tests for semantic search with vector database. These tests validate the complete semantic search flow: 1. Initialize Qdrant collection with simple in-process embeddings 2. Index sample notes into vector database 3. Perform semantic search queries 4. Verify relevant results are returned Uses SimpleEmbeddingProvider for deterministic, in-process embeddings without requiring external services like Ollama. """ import tempfile from pathlib import Path import pytest from qdrant_client import AsyncQdrantClient from qdrant_client.models import Distance, PointStruct, VectorParams from nextcloud_mcp_server.embedding import SimpleEmbeddingProvider pytestmark = pytest.mark.integration @pytest.fixture async def simple_embedding_provider(): """Simple in-process embedding provider for testing.""" return SimpleEmbeddingProvider(dimension=384) @pytest.fixture async def qdrant_test_client(): """Qdrant client for testing (in-memory).""" client = AsyncQdrantClient(":memory:") yield client await client.close() @pytest.fixture async def test_collection(qdrant_test_client: AsyncQdrantClient): """Create test collection in Qdrant.""" collection_name = "test_semantic_search" # Create collection await qdrant_test_client.create_collection( collection_name=collection_name, vectors_config=VectorParams(size=384, distance=Distance.COSINE), ) yield collection_name # Cleanup try: await qdrant_test_client.delete_collection(collection_name) except Exception: pass @pytest.fixture def sample_notes(): """Sample notes for testing semantic search.""" return [ { "id": 1, "title": "Python Async Programming", "content": """# Python Async/Await Patterns ## Key Concepts - Use async def for coroutines - Use await for async operations - asyncio.gather() for parallel execution ## Best Practices Always use async context managers for resources. Avoid blocking operations in async code.""", "category": "Development", }, { "id": 2, "title": "Book Recommendations 2025", "content": """# Books to Read ## Fiction - The Midnight Library by Matt Haig - Project Hail Mary by Andy Weir ## Non-Fiction - Atomic Habits by James Clear - Deep Work by Cal Newport ## Technical - Designing Data-Intensive Applications by Martin Kleppmann""", "category": "Personal", }, { "id": 3, "title": "Chocolate Chip Cookie Recipe", "content": """# Classic Cookies ## Ingredients - 2 cups flour - 1 cup butter - 1 cup sugar - 2 eggs - 2 cups chocolate chips ## Instructions 1. Preheat oven to 375°F 2. Mix butter and sugar 3. Add eggs and vanilla 4. Mix in flour 5. Fold in chocolate chips 6. 
Bake 10-12 minutes""", "category": "Recipes", }, { "id": 4, "title": "Team Meeting Notes", "content": """# Q1 Planning Meeting ## Attendees - Alice, Bob, Charlie ## Discussion - Review Q4 deliverables - Plan Q1 sprints - Resource allocation ## Action Items - Alice: Draft timeline - Bob: Infrastructure review""", "category": "Work", }, ] async def test_simple_embedding_provider_deterministic(simple_embedding_provider): """Test that SimpleEmbeddingProvider generates deterministic embeddings.""" text = "Hello world this is a test" # Generate embedding twice embedding1 = await simple_embedding_provider.embed(text) embedding2 = await simple_embedding_provider.embed(text) # Should be identical assert embedding1 == embedding2 assert len(embedding1) == 384 # Should be normalized (unit length) import math norm = math.sqrt(sum(x * x for x in embedding1)) assert abs(norm - 1.0) < 1e-6 async def test_simple_embedding_provider_similarity(simple_embedding_provider): """Test that similar texts have higher cosine similarity.""" async def cosine_similarity(text1: str, text2: str) -> float: emb1 = await simple_embedding_provider.embed(text1) emb2 = await simple_embedding_provider.embed(text2) return sum(a * b for a, b in zip(emb1, emb2)) # Similar texts python_text1 = "Python async programming with asyncio" python_text2 = "Using async and await in Python" unrelated_text = "Chocolate chip cookie recipe" # Similar texts should have higher similarity similar_score = await cosine_similarity(python_text1, python_text2) unrelated_score = await cosine_similarity(python_text1, unrelated_text) assert similar_score > unrelated_score assert similar_score > 0.3 # Some semantic overlap assert unrelated_score < similar_score async def test_semantic_search_with_qdrant( qdrant_test_client: AsyncQdrantClient, test_collection: str, simple_embedding_provider: SimpleEmbeddingProvider, sample_notes: list[dict], ): """Test full semantic search flow with Qdrant.""" # Index all sample notes points = [] for note in sample_notes: content = f"{note['title']}\n\n{note['content']}" embedding = await simple_embedding_provider.embed(content) points.append( PointStruct( id=note["id"], # Use integer ID for in-memory Qdrant vector=embedding, payload={ "note_id": note["id"], "title": note["title"], "category": note["category"], "excerpt": content[:200], }, ) ) await qdrant_test_client.upsert( collection_name=test_collection, points=points, wait=True ) # Test Query 1: Search for Python programming query = "async programming patterns in Python" query_embedding = await simple_embedding_provider.embed(query) response = await qdrant_test_client.query_points( collection_name=test_collection, query=query_embedding, limit=3, score_threshold=0.0, ) # Should find Python note as top result assert len(response.points) > 0 assert response.points[0].payload["note_id"] == 1 assert "Python" in response.points[0].payload["title"] # Test Query 2: Search for books query = "good books to read recommendations" query_embedding = await simple_embedding_provider.embed(query) response = await qdrant_test_client.query_points( collection_name=test_collection, query=query_embedding, limit=3, score_threshold=0.0, ) # Should find book recommendations note assert len(response.points) > 0 top_result = response.points[0] assert top_result.payload["note_id"] == 2 assert "Book" in top_result.payload["title"] # Test Query 3: Search for recipes query = "how to bake cookies dessert" query_embedding = await simple_embedding_provider.embed(query) response = await 
qdrant_test_client.query_points( collection_name=test_collection, query=query_embedding, limit=3, score_threshold=0.0, ) # Should find recipe note assert len(response.points) > 0 # Recipe should be in top 2 results top_note_ids = [r.payload["note_id"] for r in response.points[:2]] assert 3 in top_note_ids async def test_semantic_search_with_filters( qdrant_test_client: AsyncQdrantClient, test_collection: str, simple_embedding_provider: SimpleEmbeddingProvider, sample_notes: list[dict], ): """Test semantic search with category filtering.""" from qdrant_client.models import FieldCondition, Filter, MatchValue # Index notes points = [] for note in sample_notes: content = f"{note['title']}\n\n{note['content']}" embedding = await simple_embedding_provider.embed(content) points.append( PointStruct( id=note["id"], # Use integer ID for in-memory Qdrant vector=embedding, payload={ "note_id": note["id"], "title": note["title"], "category": note["category"], }, ) ) await qdrant_test_client.upsert( collection_name=test_collection, points=points, wait=True ) # Search only in "Personal" category query = "books reading" query_embedding = await simple_embedding_provider.embed(query) response = await qdrant_test_client.query_points( collection_name=test_collection, query=query_embedding, query_filter=Filter( must=[FieldCondition(key="category", match=MatchValue(value="Personal"))] ), limit=3, ) # Should only return Personal category notes assert len(response.points) > 0 for result in response.points: assert result.payload["category"] == "Personal" async def test_semantic_search_empty_results( qdrant_test_client: AsyncQdrantClient, test_collection: str, simple_embedding_provider: SimpleEmbeddingProvider, ): """Test semantic search with no indexed content returns empty results.""" query = "test query" query_embedding = await simple_embedding_provider.embed(query) response = await qdrant_test_client.query_points( collection_name=test_collection, query=query_embedding, limit=10, ) assert len(response.points) == 0 async def test_batch_embedding(simple_embedding_provider: SimpleEmbeddingProvider): """Test batch embedding generation.""" texts = [ "First document about Python", "Second document about JavaScript", "Third document about TypeScript", ] embeddings = await simple_embedding_provider.embed_batch(texts) assert len(embeddings) == 3 assert all(len(emb) == 384 for emb in embeddings) # Each should be normalized import math for emb in embeddings: norm = math.sqrt(sum(x * x for x in emb)) assert abs(norm - 1.0) < 1e-6 async def test_qdrant_persistent_mode( simple_embedding_provider: SimpleEmbeddingProvider, sample_notes: list[dict], ): """Test Qdrant in persistent local mode with file storage.""" with tempfile.TemporaryDirectory() as tmpdir: storage_path = Path(tmpdir) / "qdrant_data" # Create first client with persistent storage using path parameter client1 = AsyncQdrantClient(path=str(storage_path)) try: collection_name = "test_persistent" # Create collection and index notes await client1.create_collection( collection_name=collection_name, vectors_config=VectorParams(size=384, distance=Distance.COSINE), ) # Index sample notes points = [] for note in sample_notes: content = f"{note['title']}\n\n{note['content']}" embedding = await simple_embedding_provider.embed(content) points.append( PointStruct( id=note["id"], vector=embedding, payload={ "note_id": note["id"], "title": note["title"], "category": note["category"], }, ) ) await client1.upsert( collection_name=collection_name, points=points, wait=True ) # 
Verify data was written count_result = await client1.count(collection_name=collection_name) assert count_result.count == len(sample_notes) # Close first client await client1.close() # Create new client with same storage path client2 = AsyncQdrantClient(path=str(storage_path)) try: # Data should persist - verify collection exists collections = await client2.get_collections() collection_names = [c.name for c in collections.collections] assert collection_name in collection_names # Verify indexed data persisted count_result = await client2.count(collection_name=collection_name) assert count_result.count == len(sample_notes) # Verify search still works query = "Python programming" query_embedding = await simple_embedding_provider.embed(query) response = await client2.query_points( collection_name=collection_name, query=query_embedding, limit=3, ) # Should find Python note as top result assert len(response.points) > 0 assert response.points[0].payload["note_id"] == 1 finally: await client2.close() finally: # Cleanup await client1.close()
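
For reference, the index-then-query flow these tests exercise can also be run as a standalone script. The sketch below is a minimal example, assuming only what the tests themselves show: SimpleEmbeddingProvider(dimension=384) exposes an async embed() that returns unit-normalized 384-dimension vectors, and the qdrant_client calls mirror those above (create_collection, upsert, query_points). The collection name and sample documents are illustrative, not part of the project.

"""Minimal standalone sketch of the indexing/query flow from the tests above.

Assumes nextcloud_mcp_server.embedding.SimpleEmbeddingProvider behaves as the
tests describe: async embed() returning a 384-dim, unit-normalized vector.
"""

import asyncio

from qdrant_client import AsyncQdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams

from nextcloud_mcp_server.embedding import SimpleEmbeddingProvider


async def main() -> None:
    provider = SimpleEmbeddingProvider(dimension=384)
    client = AsyncQdrantClient(":memory:")  # or AsyncQdrantClient(path=...) for local persistence

    # Cosine-distance collection sized to the embedding dimension
    await client.create_collection(
        collection_name="notes",  # illustrative name
        vectors_config=VectorParams(size=384, distance=Distance.COSINE),
    )

    # Index a couple of documents as "title\n\ncontent", as the tests do
    docs = {
        1: "Python Async Programming\n\nUse async def and await; gather for parallelism.",
        2: "Chocolate Chip Cookie Recipe\n\nBake 10-12 minutes at 375°F.",
    }
    points = [
        PointStruct(
            id=doc_id,
            vector=await provider.embed(text),
            payload={"note_id": doc_id, "excerpt": text[:200]},
        )
        for doc_id, text in docs.items()
    ]
    await client.upsert(collection_name="notes", points=points, wait=True)

    # Embed a natural-language query and retrieve the closest notes
    query_vector = await provider.embed("async programming patterns in Python")
    response = await client.query_points(
        collection_name="notes", query=query_vector, limit=2
    )
    for point in response.points:
        print(point.payload["note_id"], point.score)

    await client.close()


if __name__ == "__main__":
    asyncio.run(main())

Because the module sets pytestmark = pytest.mark.integration, the tests themselves are typically selected with pytest's -m integration filter.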
