"""Quality tests for agentic search - verify actual functionality, not just "doesn't crash".
These tests verify:
1. similarity_score > 0 for all results (embeddings work)
2. Results are relevant to query (contain query keywords)
3. Crawling stores data (chunks_stored > 0 after web_search)
4. Completeness improves after crawling new data
REQUIREMENTS:
- Qdrant running on localhost:6333
- SearXNG running on localhost:8080
- OPENAI_API_KEY set in environment
- AGENTIC_SEARCH_ENABLED=true
- NO MOCKS - all services must be real
These are NOT flaky tests - they verify deterministic properties:
- Embeddings produce non-zero similarity scores
- Search results contain query terms
- Database operations persist data
"""
import json
from unittest.mock import MagicMock
import httpx
import pytest
import pytest_asyncio
from fastmcp import Context
from src.config import get_settings
from src.core.context import get_app_context, initialize_global_context
from src.database.qdrant_adapter import QdrantAdapter
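
# The payload fields these tests rely on, reconstructed from the assertions
# below; the real agentic_search_impl response may carry more fields:
from typing import Any, TypedDict


class AgenticSearchPayload(TypedDict, total=False):
    success: bool
    error: str
    completeness: float  # expected in [0.0, 1.0]
    results: list[dict[str, Any]]  # url, content, similarity_score per result
    search_history: list[dict[str, Any]]  # action, iteration, per-action fields

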
def create_mock_context() -> Context:
"""Create a mock FastMCP Context for testing.
Uses MagicMock with spec=Context to satisfy type checker
while providing a functional mock for tests.
"""
return MagicMock(spec=Context)
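

# Because of spec=Context, touching an attribute that the real Context class
# does not define raises AttributeError, which keeps these mocks honest:
#
#   ctx = create_mock_context()
#   ctx.no_such_method  # AttributeError; without the spec a typo would pass silently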
# Async generator fixtures need pytest_asyncio.fixture (valid in both auto and
# strict asyncio modes); a plain pytest.fixture is not awaited in strict mode.
@pytest_asyncio.fixture
async def initialized_context():
    """Initialize app context for tests."""
settings = get_settings()
# Check required services
try:
async with httpx.AsyncClient(timeout=5.0) as client:
# Check Qdrant
response = await client.get(f"{settings.qdrant_url}/collections")
if response.status_code != 200:
pytest.skip(f"Qdrant not available at {settings.qdrant_url}")
# Check SearXNG
response = await client.get(f"{settings.searxng_url}/")
if response.status_code != 200:
pytest.skip(f"SearXNG not available at {settings.searxng_url}")
# Check SearXNG returns results
response = await client.get(
f"{settings.searxng_url}/search?q=test&format=json"
)
if response.status_code != 200:
pytest.skip("SearXNG JSON API not working")
data = response.json()
if not data.get("results"):
pytest.skip("SearXNG returns no results")
except Exception as e:
pytest.skip(f"Required services not available: {e}")
# Check OpenAI API key
if not settings.openai_api_key:
pytest.skip("OPENAI_API_KEY not set")
# Check agentic search enabled
if not settings.agentic_search_enabled:
pytest.skip("AGENTIC_SEARCH_ENABLED not set to true")
# Initialize context
await initialize_global_context()
yield get_app_context()
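

# Equivalent manual health checks when a skip above is surprising (assuming
# the default localhost ports from the module docstring):
#
#   curl -s http://localhost:6333/collections
#   curl -s "http://localhost:8080/search?q=test&format=json"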
@pytest_asyncio.fixture
async def clean_test_collection(initialized_context):
    """Create a clean test collection for isolation."""
settings = get_settings()
adapter = QdrantAdapter(url=settings.qdrant_url)
test_collection = "test_agentic_search_quality"
# Delete if exists
try:
await adapter.client.delete_collection(test_collection)
except Exception:
pass
# Create fresh collection
from qdrant_client.models import Distance, VectorParams
await adapter.client.create_collection(
collection_name=test_collection,
vectors_config=VectorParams(size=1536, distance=Distance.COSINE),
)
yield test_collection
# Cleanup
try:
await adapter.client.delete_collection(test_collection)
except Exception:
pass
await adapter.client.close()
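

# No test below requests `clean_test_collection` yet; a test that needs an
# isolated collection takes it as an argument. Minimal usage sketch (the
# assert just mirrors the name hard-coded in the fixture above):
@pytest.mark.asyncio
@pytest.mark.integration
async def test_clean_collection_fixture_yields_name(clean_test_collection):
    """Smoke-check the isolation fixture itself (illustrative)."""
    assert clean_test_collection == "test_agentic_search_quality"

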
class TestSimilarityScores:
    """Test that similarity scores are valid (not zero)."""

@pytest.mark.asyncio
@pytest.mark.integration
async def test_similarity_scores_are_nonzero(self, initialized_context):
"""Verify that all returned results have similarity_score > 0.
If similarity_score is 0.0, it means:
- Embeddings are not being computed correctly, OR
- Vector search is returning wrong field, OR
- Score normalization is broken
This is a deterministic test - embeddings should always produce non-zero scores.
"""
from src.services.agentic_search import agentic_search_impl
result_json = await agentic_search_impl(
ctx=create_mock_context(),
query="What is Python programming language?",
completeness_threshold=0.5, # Low threshold to get results quickly
max_iterations=1,
max_urls_per_iteration=1,
)
result = json.loads(result_json)
assert result["success"], f"Search failed: {result.get('error')}"
assert len(result["results"]) > 0, "No results returned"
# CRITICAL: All similarity scores must be > 0
zero_score_results = [
r for r in result["results"] if r["similarity_score"] == 0.0
]
assert len(zero_score_results) == 0, (
f"Found {len(zero_score_results)} results with similarity_score=0.0. "
f"This indicates broken embeddings or vector search. "
f"URLs with zero scores: {[r['url'] for r in zero_score_results]}"
)
# Scores should be in valid range
for r in result["results"]:
assert 0.0 < r["similarity_score"] <= 1.0, (
f"Invalid similarity_score {r['similarity_score']} for {r['url']}"
            )


class TestResultRelevance:
    """Test that results are relevant to the query."""

@pytest.mark.asyncio
@pytest.mark.integration
async def test_results_contain_query_keywords(self, initialized_context):
"""Verify that results contain keywords from the query.
This is NOT testing LLM behavior - it's testing that:
- Vector search returns semantically related content
- Results are not random/unrelated documents
We use a specific technical query where results MUST contain the term.
"""
from src.services.agentic_search import agentic_search_impl
# Use a specific technical term that must appear in relevant results
query = "FastAPI Python web framework"
keywords = ["fastapi", "python", "web", "framework", "api"]
result_json = await agentic_search_impl(
ctx=create_mock_context(),
query=query,
completeness_threshold=0.5,
max_iterations=1,
max_urls_per_iteration=2,
)
result = json.loads(result_json)
assert result["success"], f"Search failed: {result.get('error')}"
if len(result["results"]) == 0:
pytest.skip("No results in database for this query")
# At least some results should contain query keywords
results_with_keywords = 0
for r in result["results"]:
content_lower = r["content"].lower()
url_lower = r["url"].lower()
combined = content_lower + " " + url_lower
# Check if any keyword is present
if any(kw in combined for kw in keywords):
results_with_keywords += 1
# At least 50% of results should be relevant
relevance_ratio = results_with_keywords / len(result["results"])
assert relevance_ratio >= 0.5, (
f"Only {relevance_ratio:.0%} of results contain query keywords. "
f"Expected at least 50%. Query: '{query}'. "
f"This indicates vector search is returning irrelevant results."
)
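

# Helper sketch for future relevance tests (not wired into the test above; it
# factors out the same keyword-hit logic used there):
def _keyword_hit_ratio(results: list[dict], keywords: list[str]) -> float:
    """Return the fraction of results whose content or URL mentions any keyword."""
    hits = sum(
        1
        for r in results
        if any(kw in (r["content"] + " " + r["url"]).lower() for kw in keywords)
    )
    return hits / len(results) if results else 0.0

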
class TestCrawlingPersistence:
    """Test that crawling actually stores data."""

@pytest.mark.asyncio
@pytest.mark.integration
async def test_crawling_stores_or_skips_duplicates(self, initialized_context):
"""Verify that web_search + crawl either stores new data or correctly skips duplicates.
After agentic search with web crawling:
- chunks_stored > 0 (new data stored), OR
- urls_stored > 0 (new URLs stored), OR
- crawl action shows urls were checked (duplicate detection working)
The crawling system correctly skips URLs already in database.
"""
from src.services.agentic_search import agentic_search_impl
# Use a query that will trigger web search
result_json = await agentic_search_impl(
ctx=create_mock_context(),
query="Rust programming language memory safety borrow checker",
completeness_threshold=0.95, # High threshold to force web search
max_iterations=2,
max_urls_per_iteration=3,
)
result = json.loads(result_json)
assert result["success"], f"Search failed: {result.get('error')}"
# Find web_search actions in history
web_search_actions = [
h for h in result["search_history"] if h["action"] == "web_search"
]
if not web_search_actions:
# If no web search was triggered, completeness was already high
if result["completeness"] >= 0.95:
pytest.skip("Completeness already high, web search not triggered")
else:
pytest.fail(
f"Web search should have been triggered at completeness "
f"{result['completeness']:.2f} < 0.95"
)
# Verify web search found URLs
urls_found = web_search_actions[0].get("urls_found", 0)
promising_urls = web_search_actions[0].get("promising_urls", 0)
assert urls_found > 0, "Web search should find some URLs"
assert promising_urls > 0, "At least some URLs should be promising"
# Check crawl actions
crawl_actions = [h for h in result["search_history"] if h["action"] == "crawl"]
# Either crawl stored data, or crawl was skipped (all duplicates)
# Both are valid outcomes
total_chunks_stored = sum(
h.get("chunks_stored", 0) for h in result["search_history"]
)
total_urls_stored = sum(
h.get("urls_stored", 0) for h in result["search_history"]
)
# If no crawl actions, URLs were all duplicates (valid)
# If crawl actions exist, verify they have valid data
if crawl_actions:
# Crawl was attempted - verify structure
for crawl in crawl_actions:
assert "urls_stored" in crawl, "crawl action missing urls_stored"
assert "chunks_stored" in crawl, "crawl action missing chunks_stored"
        # Both outcomes are valid: either new data was stored
        # (total_chunks_stored / total_urls_stored > 0), or every promising URL
        # was already in the database and was correctly skipped as a duplicate.
        assert total_chunks_stored >= 0, "chunks_stored must never be negative"
        assert total_urls_stored >= 0, "urls_stored must never be negative"
class TestCompletenessImprovement:
    """Test that completeness improves after adding relevant data."""

@pytest.mark.asyncio
@pytest.mark.integration
async def test_completeness_improves_with_data(self, initialized_context):
"""Verify that completeness score increases after crawling relevant content.
This tests the core value proposition of agentic search:
1. Start with low completeness (no/little data)
2. Crawl relevant URLs
3. Completeness should improve
If completeness doesn't improve, the system is not learning.
"""
from src.services.agentic_search import agentic_search_impl
# First search - should have low completeness for obscure topic
query = "Zig programming language comptime metaprogramming"
result_json = await agentic_search_impl(
ctx=create_mock_context(),
query=query,
completeness_threshold=0.95, # High to force multiple iterations
max_iterations=3,
max_urls_per_iteration=2,
)
result = json.loads(result_json)
assert result["success"], f"Search failed: {result.get('error')}"
# Get completeness values from search history
completeness_values = [
h["completeness"]
for h in result["search_history"]
if h.get("completeness") is not None
]
if len(completeness_values) < 2:
pytest.skip("Not enough iterations to measure improvement")
initial_completeness = completeness_values[0]
final_completeness = result["completeness"]
# If we crawled data, completeness should improve or stay high
total_chunks = sum(h.get("chunks_stored", 0) for h in result["search_history"])
if total_chunks > 0:
# We added data, completeness should not decrease significantly
assert final_completeness >= initial_completeness - 0.1, (
f"Completeness decreased from {initial_completeness:.2f} to "
f"{final_completeness:.2f} after storing {total_chunks} chunks. "
f"This indicates the crawled content is not being used correctly."
)
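        # Worked example of the tolerance above: initial=0.40, final=0.35
        # passes (0.35 >= 0.40 - 0.1), tolerating minor scoring jitter, while
        # final=0.25 would fail and flag a real regression.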
class TestSearchHistoryIntegrity:
    """Test that search history contains valid data."""

@pytest.mark.asyncio
@pytest.mark.integration
async def test_search_history_has_required_fields(self, initialized_context):
"""Verify search history entries have all required fields with valid values."""
from src.services.agentic_search import agentic_search_impl
result_json = await agentic_search_impl(
ctx=create_mock_context(),
query="What is Docker containerization?",
completeness_threshold=0.7,
max_iterations=2,
max_urls_per_iteration=2,
)
result = json.loads(result_json)
assert result["success"], f"Search failed: {result.get('error')}"
assert len(result["search_history"]) > 0, "Empty search history"
for i, entry in enumerate(result["search_history"]):
# Required fields
assert "action" in entry, f"Entry {i} missing 'action'"
assert "iteration" in entry, f"Entry {i} missing 'iteration'"
# Action-specific validation
if entry["action"] == "local_check":
assert "completeness" in entry, (
f"local_check entry {i} missing 'completeness'"
)
assert entry["completeness"] is not None, (
f"local_check entry {i} has None completeness"
)
assert 0.0 <= entry["completeness"] <= 1.0, (
f"local_check entry {i} has invalid completeness: "
f"{entry['completeness']}"
)
if entry["action"] == "web_search":
assert "urls_found" in entry, (
f"web_search entry {i} missing 'urls_found'"
)
assert entry["urls_found"] >= 0, (
f"web_search entry {i} has negative urls_found"
            )


if __name__ == "__main__":
pytest.main([__file__, "-v", "-s"])