Session Buddy

Overview Schema Related Servers Score Discussions

session-buddy
tests
integration

test_phase3_success_criteria.py•20.2 KiB

#!/usr/bin/env python3 """Integration tests for Phase 3: Progressive Search success criteria verification. Tests the following success criteria from the implementation plan: 1. Average tiers searched <2.5 for typical queries 2. Tier reduction >0 (primary metric for progressive search effectiveness) 3. Result quality maintained or improved 4. Graceful degradation to single-tier if tiers fail """ from __future__ import annotations import asyncio import time from unittest.mock import AsyncMock, patch import pytest from session_buddy.search.progressive_search import ( ProgressiveSearchEngine, SearchTier, SufficiencyConfig, TierSearchResult, ProgressiveSearchResult, ) class TestPhase3SuccessCriteria: """Test Phase 3 success criteria for progressive search.""" @pytest.fixture def engine(self): """Create a ProgressiveSearchEngine instance.""" return ProgressiveSearchEngine() @pytest.fixture def mock_db(self): """Create a mock database adapter with realistic tier response patterns.""" db = AsyncMock() # Setup tier-specific response patterns that trigger early stopping # Need 3+ results with score >=0.95 (perfect match threshold) to trigger early stop async def mock_search_reflections(*args, **kwargs): # Returns high-quality results in CATEGORIES tier (triggers early stop) # Use 0.96, 0.97, 0.98 to ensure >=0.95 threshold is met for first 3 results # Hard-code "project": "test" because _search_categories doesn't pass project to search_reflections return [ { "content": f"Python async programming insight {i}", "score": 0.96 + (i * 0.01), # 0.96, 0.97, 0.98, 0.99, 1.00 "project": "test", } for i in range(5) ] async def mock_search_conversations(*args, **kwargs): # Returns lower-quality results (would be searched in later tiers) return [] db.search_reflections = mock_search_reflections db.search_conversations = mock_search_conversations return db @pytest.mark.asyncio async def test_average_tiers_searched_less_than_2_5(self, engine, mock_db): """Test success criterion: average tiers searched <2.5 for typical queries. This is the PRIMARY success criterion for progressive search. It verifies that early stopping is effective and most queries find sufficient results in the first 1-2 tiers rather than searching all 4 tiers. """ queries = [ "Python async patterns", "FastAPI middleware", "pytest fixtures", "database optimization", "error handling", "type hints", "testing strategies", "API design", ] total_tiers = 0 for i, query in enumerate(queries): with patch.object(engine, "_db", mock_db): result = await engine.search_progressive( query=query, project="test", max_tiers=4, enable_early_stop=True, ) total_tiers += len(result.tiers_searched) # Debug first query if i == 0: print(f"\n First query debug:") print(f" Tiers searched: {len(result.tiers_searched)}") print(f" Early stop: {result.early_stop}") print(f" Total results: {result.total_results}") if result.metadata.get("early_stop_reason"): print(f" Early stop reason: {result.metadata['early_stop_reason']}") avg_tiers = total_tiers / len(queries) print(f"\nAverage Tiers Searched Results:") print(f" Total queries: {len(queries)}") print(f" Total tier searches: {total_tiers}") print(f" Average tiers per query: {avg_tiers:.2f}") print(f" Target: <2.5 tiers per query") # Success criterion: <2.5 average tiers assert avg_tiers < 2.5, f"Average tiers {avg_tiers:.2f} exceeds 2.5 threshold" @pytest.mark.asyncio async def test_progressive_search_reduces_tiers_searched(self, engine, mock_db): """Test success criterion: progressive search searches fewer tiers. This is the PRIMARY performance metric. Progressive search should stop early when sufficient results are found, rather than always searching all tiers. Actual time reduction may vary due to sufficiency checking overhead. """ query = "Python async patterns" # Setup mocks - early stopping finds perfect matches in tier 1 async def mock_search_reflections_early(*args, **kwargs): # High quality results trigger early stopping (3+ perfect matches) return [ {"content": f"result{i}", "score": 0.96 - (i * 0.01), "project": None} for i in range(5) ] async def mock_search_conversations_early(*args, **kwargs): return [] # Full search version has moderate results that don't trigger early stop # This causes it to search all 4 tiers async def mock_search_reflections_full(*args, **kwargs): # Moderate quality - no perfect matches, avg <0.85 return [ {"content": f"result{i}", "score": 0.82 - (i * 0.02), "project": None} for i in range(5) ] async def mock_search_conversations_full(*args, **kwargs): return [] # Measure progressive search (with early stopping) early_db = AsyncMock() early_db.search_reflections = mock_search_reflections_early early_db.search_conversations = mock_search_conversations_early with patch.object(engine, "_db", early_db): progressive_start = time.perf_counter() progressive_result = await engine.search_progressive( query=query, max_tiers=4, enable_early_stop=True, ) progressive_time = time.perf_counter() - progressive_start # Measure full search (no early stopping) full_db = AsyncMock() full_db.search_reflections = mock_search_reflections_full full_db.search_conversations = mock_search_conversations_full with patch.object(engine, "_db", full_db): full_start = time.perf_counter() full_result = await engine.search_progressive( query=query, max_tiers=4, enable_early_stop=False, # Disable early stopping ) full_time = time.perf_counter() - full_start # Calculate improvements tier_reduction = len(full_result.tiers_searched) - len(progressive_result.tiers_searched) print(f"\nProgressive Search Performance Results:") print(f" Progressive search: {progressive_time*1000:.2f}ms ({len(progressive_result.tiers_searched)} tiers)") print(f" Full search: {full_time*1000:.2f}ms ({len(full_result.tiers_searched)} tiers)") print(f" Tier reduction: {tier_reduction} fewer tiers searched") print(f" Early stop: {progressive_result.early_stop}") # Primary success criterion: early stopping reduces tiers searched assert len(progressive_result.tiers_searched) < len(full_result.tiers_searched), \ "Progressive search should search fewer tiers than full search when early stopping is effective" # Secondary check: if tiers were reduced, verify it was due to early stopping if tier_reduction > 0: assert progressive_result.early_stop, "Early stopping flag should be set when fewer tiers searched" @pytest.mark.asyncio async def test_result_quality_maintained(self, engine, mock_db): """Test success criterion: result quality is maintained or improved. Verifies that progressive search doesn't sacrifice result quality for performance gains. Early stopping should only occur when results are already high-quality. """ query = "high-quality insights about async" # Setup high-quality results that trigger early stopping async def mock_search_reflections(*args, **kwargs): return [ {"content": "result1", "score": 0.95}, {"content": "result2", "score": 0.93}, {"content": "result3", "score": 0.91}, {"content": "result4", "score": 0.89}, {"content": "result5", "score": 0.87}, ] mock_db.search_reflections = mock_search_reflections mock_db.search_conversations = AsyncMock(return_value=[]) with patch.object(engine, "_db", mock_db): result = await engine.search_progressive( query=query, max_tiers=4, enable_early_stop=True, ) # Should find high-quality results and stop early assert result.total_results >= 3, "Should find sufficient results" assert len(result.tiers_searched) >= 1, "Should search at least one tier" # Check result quality all_scores = [] for tier_result in result.tier_results: for r in tier_result.results: score = r.get("score", r.get("similarity", 0.0)) if score: all_scores.append(score) if all_scores: avg_score = sum(all_scores) / len(all_scores) min_score = min(all_scores) print(f"\nResult Quality Results:") print(f" Total results: {result.total_results}") print(f" Average score: {avg_score:.2f}") print(f" Minimum score: {min_score:.2f}") print(f" Tiers searched: {len(result.tiers_searched)}") print(f" Early stop: {result.early_stop}") # Results should maintain good quality assert avg_score >= 0.7, f"Average score {avg_score:.2f} is below quality threshold" assert min_score >= 0.6, f"Minimum score {min_score:.2f} is too low" @pytest.mark.asyncio async def test_graceful_degradation_to_single_tier(self, engine): """Test success criterion: graceful degradation when tiers fail. Verifies that if later tiers fail, the system still returns results from earlier tiers rather than crashing. """ query = "test query with failing later tiers" # Setup only first tier to succeed, later tiers fail async def mock_search_reflections(*args, **kwargs): return [{"content": "result", "score": 0.85}] async def mock_search_conversations_fail(*args, **kwargs): raise RuntimeError("Database connection failed for conversations") mock_db = AsyncMock() mock_db.search_reflections = mock_search_reflections mock_db.search_conversations = mock_search_conversations_fail with patch.object(engine, "_db", mock_db): # Should not raise exception despite tier failure result = await engine.search_progressive( query=query, max_tiers=2, enable_early_stop=False, ) # Should return results from successful tier assert result.total_results >= 1, "Should return results from successful tier" assert len(result.tiers_searched) >= 1, "Should have searched at least one tier" print(f"\nGraceful Degradation Results:") print(f" Total results: {result.total_results}") print(f" Tiers searched: {len(result.tiers_searched)}") print(f" Search complete: {result.search_complete}") print(f" ✓ System continued despite tier failure") @pytest.mark.asyncio async def test_sufficiency_evaluator_effectiveness(self, engine, mock_db): """Test that sufficiency evaluation correctly identifies when to stop. Verifies the early stopping logic by testing various result patterns. """ # Test perfect match early stopping (3+ results >=0.95) async def mock_search_perfect(*args, **kwargs): return [ {"content": f"result{i}", "score": 0.96 - (i * 0.005)} for i in range(4) ] mock_db.search_reflections = mock_search_perfect mock_db.search_conversations = AsyncMock(return_value=[]) with patch.object(engine, "_db", mock_db): result = await engine.search_progressive( query="perfect match query", max_tiers=4, enable_early_stop=True, ) assert result.early_stop, "Should stop early for perfect matches" assert len(result.tiers_searched) == 1, "Should only search first tier" # Test high quality early stopping (avg >=0.85, 3+ results) async def mock_search_high_quality(*args, **kwargs): return [ {"content": f"result{i}", "score": 0.90 - (i * 0.02)} for i in range(5) ] mock_db.search_reflections = mock_search_high_quality with patch.object(engine, "_db", mock_db): result = await engine.search_progressive( query="high quality query", max_tiers=4, enable_early_stop=True, ) assert result.early_stop, "Should stop early for high quality results" # Test insufficient quality (should continue to next tier) async def mock_search_low_quality(*args, **kwargs): return [ {"content": f"result{i}", "score": 0.70 - (i * 0.05)} for i in range(3) ] mock_db.search_reflections = mock_search_low_quality with patch.object(engine, "_db", mock_db): result = await engine.search_progressive( query="low quality query", max_tiers=4, enable_early_stop=True, ) # Should search more tiers for low quality results assert len(result.tiers_searched) >= 2, "Should continue searching for low quality results" print(f"\nSufficiency Evaluation Results:") print(f" ✓ Perfect match early stopping: working") print(f" ✓ High quality early stopping: working") print(f" ✓ Low quality continuation: working") @pytest.mark.asyncio async def test_tier_metadata_tracking(self, engine, mock_db): """Test that tier metadata is accurately tracked. Verifies that latency, scores, and result counts are correctly recorded for each tier searched. """ async def mock_search_reflections(*args, **kwargs): return [ {"content": f"result{i}", "score": 0.90 - (i * 0.02)} for i in range(5) ] mock_db.search_reflections = mock_search_reflections mock_db.search_conversations = AsyncMock(return_value=[]) with patch.object(engine, "_db", mock_db): result = await engine.search_progressive( query="metadata test", max_tiers=2, enable_early_stop=False, ) # Verify tier results assert len(result.tier_results) == 2, "Should have 2 tier results" for i, tier_result in enumerate(result.tier_results): assert tier_result.searched is True, f"Tier {i} should be marked as searched" assert tier_result.total_found >= 0, f"Tier {i} total_found should be non-negative" assert tier_result.latency_ms >= 0, f"Tier {i} latency should be non-negative" assert 0.0 <= tier_result.avg_score <= 1.0, f"Tier {i} avg_score should be in valid range" if tier_result.total_found > 0: assert tier_result.min_score >= 0.0, f"Tier {i} min_score should be valid" assert tier_result.max_score <= 1.0, f"Tier {i} max_score should be valid" print(f"\nTier Metadata Results:") for i, tier_result in enumerate(result.tier_results): print(f" Tier {i+1}: {tier_result.total_found} results, " f"{tier_result.avg_score:.2f} avg, {tier_result.latency_ms:.2f}ms") print(f" ✓ Tier metadata tracking is accurate") @pytest.mark.asyncio async def test_progressive_search_statistics(self, engine, mock_db): """Test that progressive search statistics are correctly calculated.""" async def mock_search_reflections(*args, **kwargs): return [{"content": "result", "score": 0.85}] mock_db.search_reflections = mock_search_reflections mock_db.search_conversations = AsyncMock(return_value=[]) with patch.object(engine, "_db", mock_db): result = await engine.search_progressive( query="statistics test", max_tiers=4, enable_early_stop=True, ) # Verify statistics in result assert "avg_tiers_searched" in result.metadata assert "max_tiers_allowed" in result.metadata assert result.metadata["avg_tiers_searched"] == len(result.tiers_searched) assert result.metadata["max_tiers_allowed"] == 4 print(f"\nProgressive Search Statistics Results:") print(f" Tiers searched: {len(result.tiers_searched)}") print(f" Total results: {result.total_results}") print(f" Total latency: {result.total_latency_ms:.2f}ms") print(f" Early stop: {result.early_stop}") print(f" Search complete: {result.search_complete}") print(f" ✓ Statistics are correctly tracked") @pytest.mark.asyncio async def test_max_tiers_parameter_validation(self, engine, mock_db): """Test that max_tiers parameter correctly limits search depth.""" async def mock_search_reflections(*args, **kwargs): return [{"content": "result", "score": 0.70}] mock_db.search_reflections = mock_search_reflections mock_db.search_conversations = AsyncMock(return_value=[]) # Test with max_tiers=1 with patch.object(engine, "_db", mock_db): result = await engine.search_progressive( query="test", max_tiers=1, enable_early_stop=False, ) assert len(result.tiers_searched) == 1, "Should only search 1 tier" assert result.tiers_searched[0] == SearchTier.CATEGORIES, "Should search CATEGORIES tier" # Test with max_tiers=2 with patch.object(engine, "_db", mock_db): result = await engine.search_progressive( query="test", max_tiers=2, enable_early_stop=False, ) assert len(result.tiers_searched) == 2, "Should search 2 tiers" assert result.tiers_searched[0] == SearchTier.CATEGORIES assert result.tiers_searched[1] == SearchTier.INSIGHTS print(f"\nMax Tiers Validation Results:") print(f" ✓ max_tiers=1: searches 1 tier") print(f" ✓ max_tiers=2: searches 2 tiers") print(f" ✓ max_tiers parameter correctly limits search depth") @pytest.mark.asyncio async def test_early_stop_disabled_behavior(self, engine, mock_db): """Test that early_stop=False searches all tiers regardless of quality.""" # Setup results that would trigger early stop if enabled async def mock_search_reflections(*args, **kwargs): return [ {"content": f"result{i}", "score": 0.96 - (i * 0.01)} for i in range(5) ] mock_db.search_reflections = mock_search_reflections mock_db.search_conversations = AsyncMock(return_value=[]) with patch.object(engine, "_db", mock_db): result = await engine.search_progressive( query="test", max_tiers=2, enable_early_stop=False, # Disable early stopping ) # Should search all 2 tiers even with perfect matches assert not result.early_stop, "Early stop should be disabled" assert len(result.tiers_searched) == 2, "Should search all configured tiers" print(f"\nEarly Stop Disabled Results:") print(f" Tiers searched: {len(result.tiers_searched)}") print(f" Early stop: {result.early_stop}") print(f" ✓ Early stop disabled correctly searches all tiers")

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/lesleslie/session-buddy'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

test_phase3_success_criteria.py•20.2 KiB