"""Integration tests for agentic search with real OpenAI API.
This test suite uses gpt-4.1-nano (CHEAP model ~$0.00015/1K tokens) for
cost-effective integration testing WITHOUT mocks.
Prerequisites:
- OPENAI_API_KEY or TEST_OPENAI_API_KEY must be set
- TEST_MODEL_CHOICE=gpt-4.1-nano (optional; defaults to gpt-4.1-nano when unset)
- Qdrant running at localhost:6333
- SearXNG running at localhost:8080 (optional for full test)
- AGENTIC_SEARCH_ENABLED=true
Cost estimate per test run: ~$0.001-0.002 USD with gpt-4.1-nano
Run with: pytest tests/test_agentic_search_integration.py -v
"""
import json
import os
import pytest
from src.config import get_settings, reset_settings
from src.core.context import initialize_global_context
from src.core.exceptions import LLMError
from src.services.agentic_search import (
AgenticSearchConfig,
AgenticSearchService,
LocalKnowledgeEvaluator,
SelectiveCrawler,
URLRanker,
agentic_search_impl,
)
# Mark all tests as integration tests
pytestmark = pytest.mark.integration
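# The "integration" and "slow" markers are assumed to be registered in the project's
# pytest configuration (e.g. [tool.pytest.ini_options] markers in pyproject.toml);
# otherwise pytest emits PytestUnknownMarkWarning for them.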
@pytest.fixture(scope="module")
def test_settings():
"""Configure settings for integration tests."""
# Set test environment variables BEFORE reset
os.environ["AGENTIC_SEARCH_ENABLED"] = "true"
os.environ["AGENTIC_SEARCH_COMPLETENESS_THRESHOLD"] = "0.8" # Lower for testing
os.environ["AGENTIC_SEARCH_MAX_ITERATIONS"] = "2" # Fewer iterations for speed
os.environ["AGENTIC_SEARCH_MAX_URLS_PER_ITERATION"] = "2"
os.environ["AGENTIC_SEARCH_URL_SCORE_THRESHOLD"] = "0.6"
    # Use the test model if provided, otherwise fall back to gpt-4.1-nano
    os.environ["MODEL_CHOICE"] = os.getenv("TEST_MODEL_CHOICE", "gpt-4.1-nano")
    # Use the test API key if provided
    if os.getenv("TEST_OPENAI_API_KEY"):
        os.environ["OPENAI_API_KEY"] = os.environ["TEST_OPENAI_API_KEY"]
# Reset settings to reload from environment AFTER setting variables
reset_settings()
settings = get_settings()
# Validate required settings
if not settings.openai_api_key:
pytest.skip("OPENAI_API_KEY not set - skipping integration test")
yield settings
# Cleanup
reset_settings()
@pytest.fixture
async def app_context(test_settings):
"""Initialize application context for tests."""
    # Only wrap initialization; a failure here means the environment is not ready.
    try:
        ctx = await initialize_global_context()
    except Exception as e:
        pytest.skip(f"Failed to initialize app context: {e}")
    yield ctx
@pytest.fixture
async def mock_fastmcp_context():
"""Create a simple Context object for testing."""
class SimpleContext:
pass
return SimpleContext()
class TestAgenticSearchService:
"""Test AgenticSearchService with real OpenAI API."""
@pytest.mark.asyncio
async def test_service_initialization(self, test_settings):
"""Test that service initializes correctly with Pydantic AI agents."""
# Create components
config = AgenticSearchConfig()
evaluator = LocalKnowledgeEvaluator(config)
ranker = URLRanker(config)
crawler = SelectiveCrawler(config)
# Create service
service = AgenticSearchService(
evaluator=evaluator,
ranker=ranker,
crawler=crawler,
config=config,
)
# Verify components are set
assert service.evaluator is not None
assert service.ranker is not None
assert service.crawler is not None
assert service.openai_model is not None
assert service.model_name is not None
        assert 0 <= service.temperature <= 1
        assert 0 <= service.completeness_threshold <= 1
@pytest.mark.asyncio
@pytest.mark.slow
async def test_completeness_evaluation_with_real_llm(
self,
test_settings,
app_context,
):
"""Test completeness evaluation with real gpt-4.1-nano API call.
Cost: ~$0.0001 USD per call with gpt-4.1-nano
Note: This test requires OpenAI API access and will be skipped if unavailable.
"""
# Create components
config = AgenticSearchConfig()
evaluator = LocalKnowledgeEvaluator(config)
# Test with empty results - should score low
from src.services.agentic_models import RAGResult
empty_results = []
# Wrap in try-except to skip if API unavailable
try:
evaluation = await evaluator._evaluate_completeness(
query="What is Python?",
results=empty_results,
)
except LLMError:
pytest.skip("OpenAI API unavailable - skipping real LLM test")
        assert 0.0 <= evaluation.score <= 1.0
assert evaluation.reasoning
assert isinstance(evaluation.gaps, list)
# Empty results should have low completeness
assert evaluation.score < 0.5
# Test with mock results - should score higher
mock_results = [
RAGResult(
content="Python is a high-level, interpreted programming language known for its simplicity and readability.",
url="https://example.com/python",
similarity_score=0.95,
chunk_index=0,
),
RAGResult(
content="Python supports multiple programming paradigms including procedural, object-oriented, and functional programming.",
url="https://example.com/python-features",
similarity_score=0.92,
chunk_index=0,
),
]
try:
evaluation_with_results = await evaluator._evaluate_completeness(
query="What is Python?",
results=mock_results,
)
except LLMError:
pytest.skip("OpenAI API unavailable - skipping real LLM test")
        assert 0.0 <= evaluation_with_results.score <= 1.0
assert evaluation_with_results.reasoning
# With relevant results, score should be higher
assert evaluation_with_results.score > evaluation.score
@pytest.mark.asyncio
@pytest.mark.slow
async def test_url_ranking_with_real_llm(self, test_settings):
"""Test URL ranking with real gpt-4.1-nano API call.
Cost: ~$0.0002 USD per call with gpt-4.1-nano
Note: This test requires OpenAI API access and will be skipped if unavailable.
"""
# Create components
config = AgenticSearchConfig()
ranker = URLRanker(config)
mock_search_results = [
{
"title": "Official Python Tutorial",
"url": "https://docs.python.org/3/tutorial/",
"snippet": "The Python Tutorial — Python 3.12 documentation. This tutorial introduces the reader informally to the basic concepts...",
},
{
"title": "Python Wikipedia",
"url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
"snippet": "Python is a high-level, interpreted programming language with dynamic semantics...",
},
{
"title": "Python Snake Care Guide",
"url": "https://pets.example.com/python-care",
"snippet": "Learn how to care for your pet python snake. Housing, feeding, and health tips...",
},
]
try:
rankings = await ranker._rank_urls(
query="Python programming language tutorial",
gaps=["basic syntax", "getting started"],
search_results=mock_search_results,
)
except LLMError:
pytest.skip("OpenAI API unavailable - skipping real LLM test")
assert len(rankings) == 3
assert all(0.0 <= r.score <= 1.0 for r in rankings)
assert all(r.reasoning for r in rankings)
# Should be sorted by score descending
for i in range(len(rankings) - 1):
assert rankings[i].score >= rankings[i + 1].score
# Programming content should rank higher than snake care
python_doc_score = next(r.score for r in rankings if "docs.python.org" in r.url)
snake_care_score = next(
r.score for r in rankings if "pets.example.com" in r.url
)
assert python_doc_score > snake_care_score
@pytest.mark.asyncio
@pytest.mark.slow
async def test_query_refinement_with_real_llm(self, test_settings):
"""Test query refinement with real gpt-4.1-nano API call.
Cost: ~$0.0001 USD per call with gpt-4.1-nano
Note: This test requires OpenAI API access and will be skipped if unavailable.
"""
# Create components
config = AgenticSearchConfig()
evaluator = LocalKnowledgeEvaluator(config)
ranker = URLRanker(config)
crawler = SelectiveCrawler(config)
service = AgenticSearchService(
evaluator=evaluator,
ranker=ranker,
crawler=crawler,
config=config,
)
try:
refinement = await service._stage4_query_refinement(
original_query="What is Python?",
current_query="What is Python?",
gaps=["type system", "performance characteristics", "use cases"],
)
except LLMError:
pytest.skip("OpenAI API unavailable - skipping real LLM test")
assert refinement.original_query == "What is Python?"
assert refinement.current_query == "What is Python?"
assert len(refinement.refined_queries) > 0
assert len(refinement.refined_queries) <= 3
assert refinement.reasoning
# Refined queries should be different from original
assert any(q != "What is Python?" for q in refinement.refined_queries)
class TestAgenticSearchIntegration:
"""Integration tests for full agentic search pipeline."""
@pytest.mark.asyncio
@pytest.mark.slow
@pytest.mark.skipif(
not os.getenv("RUN_EXPENSIVE_TESTS"),
reason="Expensive test - set RUN_EXPENSIVE_TESTS=true to run",
)
async def test_full_agentic_search_pipeline(
self,
test_settings,
app_context,
mock_fastmcp_context,
):
"""Test full agentic search pipeline with real services.
WARNING: This test hits real OpenAI API, SearXNG, and crawls real URLs.
Cost: ~$0.002-0.005 USD per run with gpt-4.1-nano
Prerequisites:
- Qdrant running
- SearXNG running
- Set RUN_EXPENSIVE_TESTS=true to enable
"""
result_json = await agentic_search_impl(
ctx=mock_fastmcp_context,
query="What is pytest?", # Simple query for testing
completeness_threshold=0.85,
max_iterations=2,
max_urls_per_iteration=2,
url_score_threshold=0.6,
)
result = json.loads(result_json)
# Verify comprehensive result
assert result["success"] is True
assert result["query"] == "What is pytest?"
assert result["iterations"] >= 1
assert result["iterations"] <= 2
assert 0.0 <= result["completeness"] <= 1.0
assert isinstance(result["results"], list)
assert len(result["search_history"]) > 0
# Verify it attempted multiple stages
actions = {item["action"] for item in result["search_history"]}
assert "local_check" in actions # Stage 1 always runs
# If completeness was low, should have attempted web search
if result["completeness"] < 0.85:
assert (
"web_search" in actions or result["status"] == "max_iterations_reached"
)
def test_settings_validation_for_agentic_search():
"""Test that settings validation works correctly."""
reset_settings()
# Test with disabled agentic search
os.environ["AGENTIC_SEARCH_ENABLED"] = "false"
settings = get_settings()
assert settings.agentic_search_enabled is False
# Test with enabled agentic search
os.environ["AGENTIC_SEARCH_ENABLED"] = "true"
reset_settings()
settings = get_settings()
assert settings.agentic_search_enabled is True
# Test threshold validation
os.environ["AGENTIC_SEARCH_COMPLETENESS_THRESHOLD"] = "0.95"
reset_settings()
settings = get_settings()
assert settings.agentic_search_completeness_threshold == 0.95
# Test invalid threshold (should raise validation error)
os.environ["AGENTIC_SEARCH_COMPLETENESS_THRESHOLD"] = "1.5"
reset_settings()
# Pydantic validates thresholds - invalid values should raise error
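    # (Assumption) the settings model presumably bounds this field, e.g. roughly:
    #   agentic_search_completeness_threshold: float = Field(0.8, ge=0.0, le=1.0)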
    from pydantic import ValidationError
    with pytest.raises(ValidationError):
settings = get_settings()
    # Reset to a valid value and clear the cached settings
    os.environ["AGENTIC_SEARCH_COMPLETENESS_THRESHOLD"] = "0.8"
    reset_settings()
if __name__ == "__main__":
# Run tests with: python -m pytest tests/test_agentic_search_integration.py -v
pytest.main([__file__, "-v", "-s"])