"""Integration tests for Natural Language Intent Detection accuracy.
Tests the accuracy of intent detection across various tools and phrasing variations.
Target: >90% accuracy for common use cases.
"""
from __future__ import annotations
import pytest
import pytest_asyncio
from session_buddy.core.intent_detector import IntentDetector, ToolMatch
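# The assertions below assume ToolMatch results expose ``tool_name``, ``confidence``,
# and ``extracted_args``; these are the only attributes of the result the tests rely on.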
@pytest_asyncio.fixture
async def detector() -> IntentDetector:
"""Initialize intent detector for testing."""
detector = IntentDetector()
await detector.initialize()
return detector
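# A minimal sketch of the accuracy loop that each test class below inlines; it is
# illustrative only (the tests do not call it) and assumes the same detect_intent
# signature used throughout this module.
async def _measure_accuracy(
    detector: IntentDetector,
    test_cases: list[tuple[str, str, float]],
) -> float:
    """Return the fraction of cases resolved to the expected tool with sufficient confidence."""
    passed = 0
    for message, expected_tool, min_confidence in test_cases:
        result = await detector.detect_intent(message, confidence_threshold=0.5)
        if result and result.tool_name == expected_tool and result.confidence >= min_confidence:
            passed += 1
    return passed / len(test_cases)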
class TestIntentDetectionAccuracy:
"""Test intent detection accuracy across various tools."""
@pytest.mark.asyncio
async def test_checkpoint_phrases(self, detector: IntentDetector) -> None:
"""Test checkpoint detection with various phrasings."""
test_cases = [
("save my progress", "checkpoint", 0.8),
("checkpoint this", "checkpoint", 0.8),
("create a checkpoint", "checkpoint", 0.8),
("I want to save", "checkpoint", 0.8),
("Time to checkpoint before the next feature", "checkpoint", 0.6),
("Let me save what I have so far", "checkpoint", 0.6),
]
passed = 0
for message, expected_tool, min_confidence in test_cases:
result = await detector.detect_intent(message, confidence_threshold=0.5)
if result and result.tool_name == expected_tool and result.confidence >= min_confidence:
passed += 1
else:
print(f"❌ Failed: '{message}' -> expected {expected_tool}, got {result}")
accuracy = passed / len(test_cases)
print(f"\n✅ Checkpoint accuracy: {accuracy:.1%} ({passed}/{len(test_cases)})")
assert accuracy >= 0.9, f"Checkpoint accuracy {accuracy:.1%} below 90% target"
@pytest.mark.asyncio
async def test_search_reflections_phrases(self, detector: IntentDetector) -> None:
"""Test search_reflections detection with various phrasings."""
test_cases = [
("what did I learn about async?", "search_reflections", 0.7),
("find insights on authentication", "search_reflections", 0.7),
("search for database patterns", "search_reflections", 0.7),
("What do I know about optimization?", "search_reflections", 0.7),
("Any insights on error handling?", "search_reflections", 0.7),
]
passed = 0
for message, expected_tool, min_confidence in test_cases:
result = await detector.detect_intent(message, confidence_threshold=0.5)
if result and result.tool_name == expected_tool and result.confidence >= min_confidence:
passed += 1
else:
print(f"❌ Failed: '{message}' -> expected {expected_tool}, got {result}")
accuracy = passed / len(test_cases)
print(f"\n✅ Search reflections accuracy: {accuracy:.1%} ({passed}/{len(test_cases)})")
assert accuracy >= 0.9, f"Search reflections accuracy {accuracy:.1%} below 90% target"
@pytest.mark.asyncio
async def test_quality_monitor_phrases(self, detector: IntentDetector) -> None:
"""Test quality_monitor detection with various phrasings."""
test_cases = [
("how's the code quality", "quality_monitor", 0.8),
("check quality", "quality_monitor", 0.8),
("analyze project health", "quality_monitor", 0.8),
("What's the current project health?", "quality_monitor", 0.6),
("Check the quality of my recent work", "quality_monitor", 0.6),
]
passed = 0
for message, expected_tool, min_confidence in test_cases:
result = await detector.detect_intent(message, confidence_threshold=0.5)
if result and result.tool_name == expected_tool and result.confidence >= min_confidence:
passed += 1
else:
print(f"❌ Failed: '{message}' -> expected {expected_tool}, got {result}")
accuracy = passed / len(test_cases)
print(f"\n✅ Quality monitor accuracy: {accuracy:.1%} ({passed}/{len(test_cases)})")
assert accuracy >= 0.9, f"Quality monitor accuracy {accuracy:.1%} below 90% target"
@pytest.mark.asyncio
async def test_store_reflection_phrases(self, detector: IntentDetector) -> None:
"""Test store_reflection detection with various phrasings."""
test_cases = [
("remember that", "store_reflection", 0.8),
("save as insight", "store_reflection", 0.8),
("create a reflection", "store_reflection", 0.8),
("note this", "store_reflection", 0.8),
("Remember that we fixed this by using retries", "store_reflection", 0.6),
]
passed = 0
for message, expected_tool, min_confidence in test_cases:
result = await detector.detect_intent(message, confidence_threshold=0.5)
if result and result.tool_name == expected_tool and result.confidence >= min_confidence:
passed += 1
else:
print(f"❌ Failed: '{message}' -> expected {expected_tool}, got {result}")
accuracy = passed / len(test_cases)
print(f"\n✅ Store reflection accuracy: {accuracy:.1%} ({passed}/{len(test_cases)})")
assert accuracy >= 0.9, f"Store reflection accuracy {accuracy:.1%} below 90% target"
@pytest.mark.asyncio
async def test_crackerjack_health_check_phrases(self, detector: IntentDetector) -> None:
"""Test crackerjack_health_check detection with various phrasings."""
test_cases = [
("check crackerjack", "crackerjack_health_check", 0.8),
("crackerjack status", "crackerjack_health_check", 0.8),
("is crackerjack working", "crackerjack_health_check", 0.8),
("Is crackerjack working properly?", "crackerjack_health_check", 0.6),
]
passed = 0
for message, expected_tool, min_confidence in test_cases:
result = await detector.detect_intent(message, confidence_threshold=0.5)
if result and result.tool_name == expected_tool and result.confidence >= min_confidence:
passed += 1
else:
print(f"❌ Failed: '{message}' -> expected {expected_tool}, got {result}")
accuracy = passed / len(test_cases)
print(f"\n✅ Crackerjack health check accuracy: {accuracy:.1%} ({passed}/{len(test_cases)})")
assert accuracy >= 0.9, f"Crackerjack health check accuracy {accuracy:.1%} below 90% target"
class TestArgumentExtraction:
"""Test argument extraction from natural language."""
@pytest.mark.asyncio
async def test_search_reflections_argument_extraction(self, detector: IntentDetector) -> None:
"""Test argument extraction for search_reflections."""
test_cases = [
("what did I learn about async?", "async"),
("find insights on authentication", "authentication"),
("search for database patterns", "database patterns"),
]
passed = 0
for message, expected_query in test_cases:
result = await detector.detect_intent(message, confidence_threshold=0.5)
if (
result
and result.tool_name == "search_reflections"
and result.extracted_args.get("query") == expected_query
):
passed += 1
else:
print(f"❌ Failed: '{message}' -> expected query '{expected_query}', got {result.extracted_args if result else {}}")
accuracy = passed / len(test_cases)
print(f"\n✅ Argument extraction accuracy: {accuracy:.1%} ({passed}/{len(test_cases)})")
assert accuracy >= 0.9, f"Argument extraction accuracy {accuracy:.1%} below 90% target"
@pytest.mark.asyncio
async def test_query_similar_errors_argument_extraction(self, detector: IntentDetector) -> None:
"""Test argument extraction for query_similar_errors."""
test_cases = [
("have I seen this TypeError before?", "TypeError"),
("have I encountered this timeout error?", "timeout"),
("similar import errors", "import errors"),
]
passed = 0
for message, expected_error in test_cases:
result = await detector.detect_intent(message, confidence_threshold=0.5)
if (
result
and result.tool_name == "query_similar_errors"
and result.extracted_args.get("error_message") == expected_error
):
passed += 1
else:
print(f"❌ Failed: '{message}' -> expected error '{expected_error}', got {result.extracted_args if result else {}}")
accuracy = passed / len(test_cases)
print(f"\n✅ Error argument extraction accuracy: {accuracy:.1%} ({passed}/{len(test_cases)})")
assert accuracy >= 0.9, f"Error argument extraction accuracy {accuracy:.1%} below 90% target"
class TestDisambiguation:
"""Test disambiguation handling for ambiguous inputs."""
@pytest.mark.asyncio
async def test_low_confidence_returns_none(self, detector: IntentDetector) -> None:
"""Test that low confidence inputs return None."""
# Very ambiguous message
result = await detector.detect_intent("do something", confidence_threshold=0.7)
assert result is None, "Low confidence input should return None"
@pytest.mark.asyncio
async def test_suggestions_for_ambiguous_input(self, detector: IntentDetector) -> None:
"""Test that suggestions are provided for ambiguous input."""
# "check quality" could match quality_monitor or crackerjack_health_check
suggestions = await detector.get_suggestions("check quality", limit=3)
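        # Assumed suggestion shape: each entry is a dict with at least "tool" and
        # "confidence" keys, e.g. {"tool": "quality_monitor", "confidence": 0.62}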
assert len(suggestions) > 0, "Should provide suggestions for ambiguous input"
assert all("tool" in s for s in suggestions), "All suggestions should have tool names"
assert all("confidence" in s for s in suggestions), "All suggestions should have confidence"
class TestConfidenceScoring:
"""Test confidence scoring mechanism."""
@pytest.mark.asyncio
async def test_pattern_match_confidence(self, detector: IntentDetector) -> None:
"""Test that pattern matches get expected confidence."""
result = await detector.detect_intent("save my progress", confidence_threshold=0.5)
assert result is not None
        assert result.confidence == pytest.approx(0.8), "Pattern match should have 0.8 confidence"
@pytest.mark.asyncio
async def test_semantic_match_confidence_range(self, detector: IntentDetector) -> None:
"""Test that semantic matches have appropriate confidence range."""
# This should match semantically
result = await detector.detect_intent("I've made good progress", confidence_threshold=0.5)
if result:
assert 0.0 <= result.confidence <= 1.0, "Confidence should be between 0 and 1"
@pytest.mark.asyncio
async def test_confidence_threshold_filtering(self, detector: IntentDetector) -> None:
"""Test that confidence threshold filters results."""
# Very high threshold should filter out most matches
result_strict = await detector.detect_intent("check it", confidence_threshold=0.95)
assert result_strict is None, "High threshold should filter low confidence matches"
        # A lower threshold may allow a match; if one is returned it must meet that threshold
        result_lenient = await detector.detect_intent("check it", confidence_threshold=0.3)
        if result_lenient is not None:
            assert result_lenient.confidence >= 0.3, "Match should meet the lenient threshold"
class TestOverallAccuracy:
"""Overall accuracy tests across all tools."""
@pytest.mark.asyncio
async def test_overall_accuracy_target(self, detector: IntentDetector) -> None:
"""Test overall accuracy across diverse toolset (>90% target)."""
# Representative sample from different tool categories
test_cases = [
# Session management
("save my progress", "checkpoint"),
("end my session", "end"),
("session status", "status"),
# Search & memory
("what did I learn about async?", "search_reflections"),
("find code related to dependency injection", "search_by_concept"),
("what did I do to auth.py?", "search_by_file"),
# Quality & monitoring
("how's the code quality", "quality_monitor"),
("check crackerjack", "crackerjack_health_check"),
# Reflection & learning
("remember that", "store_reflection"),
("how can I improve?", "suggest_workflow_improvements"),
# Advanced features
("what files am I editing?", "get_active_files"),
("activity summary", "get_activity_summary"),
]
passed = 0
for message, expected_tool in test_cases:
result = await detector.detect_intent(message, confidence_threshold=0.5)
if result and result.tool_name == expected_tool:
passed += 1
else:
print(f"❌ Failed: '{message}' -> expected {expected_tool}, got {result.tool_name if result else None}")
accuracy = passed / len(test_cases)
print(f"\n✅ Overall accuracy: {accuracy:.1%} ({passed}/{len(test_cases)})")
assert accuracy >= 0.9, f"Overall accuracy {accuracy:.1%} below 90% target"
class TestEdgeCases:
"""Test edge cases and error handling."""
@pytest.mark.asyncio
async def test_empty_message(self, detector: IntentDetector) -> None:
"""Test handling of empty messages."""
result = await detector.detect_intent("", confidence_threshold=0.7)
assert result is None, "Empty message should return None"
@pytest.mark.asyncio
async def test_whitespace_only_message(self, detector: IntentDetector) -> None:
"""Test handling of whitespace-only messages."""
result = await detector.detect_intent(" ", confidence_threshold=0.7)
assert result is None, "Whitespace-only message should return None"
@pytest.mark.asyncio
async def test_unknown_command(self, detector: IntentDetector) -> None:
"""Test handling of unknown commands."""
result = await detector.detect_intent("xyz123", confidence_threshold=0.7)
assert result is None, "Unknown command should return None"
@pytest.mark.asyncio
async def test_graceful_degradation(self, detector: IntentDetector) -> None:
"""Test graceful degradation when embeddings fail."""
# This test verifies the system works even when semantic matching fails
# Pattern matching should still work
result = await detector.detect_intent("save my progress", confidence_threshold=0.5)
assert result is not None, "Should fallback to pattern matching"
assert result.tool_name == "checkpoint", "Pattern match should work"