test_planner_validation.py•30.3 kB
#!/usr/bin/env python3
"""
PlannerWorkflow Tool Validation Test
Tests the planner tool's capabilities using the new workflow architecture.
This validates that the new workflow-based implementation maintains all the
functionality of the original planner tool while using the workflow pattern
like the debug tool.
"""
import json
from typing import Optional
from .conversation_base_test import ConversationBaseTest
class PlannerValidationTest(ConversationBaseTest):
    """Test planner tool with new workflow architecture.

    Every ``_test_*`` scenario drives the ``planner`` tool through
    ``call_mcp_tool`` and inspects the JSON text it returns. A scenario logs
    the reason for a failure and returns ``False`` instead of raising.
    """

    @property
    def test_name(self) -> str:
        """Short identifier of this test."""
        return "planner_validation"

    @property
    def test_description(self) -> str:
        """One-line description of what this test covers."""
        return "PlannerWorkflow tool validation with new workflow architecture"

    def run_test(self) -> bool:
        """Run all planner scenarios in order, stopping at the first failure.

        Returns:
            True when every scenario passes; False when one of them fails or
            an exception escapes a scenario.
        """
        # Set up the test environment
        self.setUp()
        try:
            self.logger.info("Test: PlannerWorkflow tool validation (new architecture)")

            # Order matters: scenario 2 reuses the continuation id stored by scenario 1.
            scenarios = (
                # Test 1: Single planning session with workflow architecture
                self._test_single_planning_session,
                # Test 2: Planning with continuation using workflow
                self._test_planning_with_continuation,
                # Test 3: Complex plan with deep thinking pauses
                self._test_complex_plan_deep_thinking,
                # Test 4: Self-contained completion (no expert analysis)
                self._test_self_contained_completion,
                # Test 5: Branching and revision with workflow
                self._test_branching_and_revision,
                # Test 6: Workflow file context behavior
                self._test_workflow_file_context,
            )
            for scenario in scenarios:
                if not scenario():
                    return False

            self.logger.info(" ✅ All planner validation tests passed")
            return True
        except Exception as e:
            self.logger.error(f"PlannerWorkflow validation test failed: {e}")
            return False

    def _find_status_key(self, response_data: dict) -> Optional[str]:
        """Return the first key of ``response_data`` ending in ``_status``, or None."""
        return next((key for key in response_data if key.endswith("_status")), None)

    def _test_single_planning_session(self) -> bool:
        """Test a complete planning session with workflow architecture.

        Sends three planner steps on one continuation id, validating the two
        intermediate pauses and the final self-contained completion. On success
        the continuation id is stored on ``self.api_continuation_id``.

        Returns:
            True when all three steps produce the expected responses.
        """
        try:
            self.logger.info(" 1.1: Testing single planning session with workflow")

            # Step 1: Start planning
            self.logger.info(" 1.1.1: Step 1 - Initial planning step")
            response1, continuation_id = self.call_mcp_tool(
                "planner",
                {
                    "step": "I need to plan a comprehensive API redesign for our legacy system. Let me start by analyzing the current state and identifying key requirements for the new API architecture.",
                    "step_number": 1,
                    "total_steps": 4,
                    "next_step_required": True,
                    "model": "flash",
                },
            )
            if not response1 or not continuation_id:
                self.logger.error("Failed to get initial planning response")
                return False

            # Parse and validate JSON response
            response1_data = self._parse_planner_response(response1)
            if not response1_data:
                return False

            # Validate step 1 response structure - expect pause_for_planner for next_step_required=True
            if not self._validate_step_response(response1_data, 1, 4, True, "pause_for_planner"):
                return False

            # Debug: Log the actual response structure to see what we're getting
            self.logger.debug(f"Response structure: {list(response1_data.keys())}")

            # Check workflow-specific response structure (more flexible)
            status_key = self._find_status_key(response1_data)
            if not status_key:
                self.logger.error(f"Missing workflow status field in response: {list(response1_data.keys())}")
                return False
            self.logger.debug(f"Found status field: {status_key}")

            # Check required_actions for workflow guidance
            if not response1_data.get("required_actions"):
                self.logger.error("Missing required_actions in workflow response")
                return False

            self.logger.info(f" ✅ Step 1 successful with workflow, continuation_id: {continuation_id}")

            # Step 2: Continue planning
            self.logger.info(" 1.1.2: Step 2 - API domain analysis")
            response2, _ = self.call_mcp_tool(
                "planner",
                {
                    "step": "After analyzing the current API, I can identify three main domains: User Management, Content Management, and Analytics. Let me design the new API structure with RESTful endpoints and proper versioning.",
                    "step_number": 2,
                    "total_steps": 4,
                    "next_step_required": True,
                    "continuation_id": continuation_id,
                    "model": "flash",
                },
            )
            if not response2:
                self.logger.error("Failed to continue planning to step 2")
                return False

            response2_data = self._parse_planner_response(response2)
            # Stop on an unparseable response here, as every other parse site does,
            # so the parser's own error is the last thing logged.
            if not response2_data:
                return False
            if not self._validate_step_response(response2_data, 2, 4, True, "pause_for_planner"):
                return False

            # Check step history tracking in workflow (more flexible)
            status_key = self._find_status_key(response2_data)
            if status_key:
                workflow_status = response2_data.get(status_key, {})
                step_history_length = workflow_status.get("step_history_length", 0)
                if step_history_length < 2:
                    self.logger.error(f"Step history not properly tracked in workflow: {step_history_length}")
                    return False
                self.logger.debug(f"Step history length: {step_history_length}")
            else:
                self.logger.warning("No workflow status found, skipping step history check")

            self.logger.info(" ✅ Step 2 successful with workflow tracking")

            # Step 3: Final step - should trigger completion
            self.logger.info(" 1.1.3: Step 3 - Final planning step")
            response3, _ = self.call_mcp_tool(
                "planner",
                {
                    "step": "API redesign plan complete: Phase 1 - User Management API, Phase 2 - Content Management API, Phase 3 - Analytics API. Each phase includes proper authentication, rate limiting, and comprehensive documentation.",
                    "step_number": 3,
                    "total_steps": 3,  # Adjusted total
                    "next_step_required": False,  # Final step - should complete without expert analysis
                    "continuation_id": continuation_id,
                    "model": "flash",
                },
            )
            if not response3:
                self.logger.error("Failed to complete planning session")
                return False

            response3_data = self._parse_planner_response(response3)
            if not response3_data:
                return False

            # Validate final response structure - should be self-contained completion
            if response3_data.get("status") != "planner_complete":
                self.logger.error(f"Expected status 'planner_complete', got '{response3_data.get('status')}'")
                return False

            if not response3_data.get("planning_complete"):
                self.logger.error("Expected planning_complete=true for final step")
                return False

            # Should NOT have expert_analysis (self-contained)
            if "expert_analysis" in response3_data:
                self.logger.error("PlannerWorkflow should be self-contained without expert analysis")
                return False

            # Check plan_summary exists
            if not response3_data.get("plan_summary"):
                self.logger.error("Missing plan_summary in final step")
                return False

            self.logger.info(" ✅ Planning session completed successfully with workflow architecture")

            # Store continuation_id for next test
            self.api_continuation_id = continuation_id
            return True
        except Exception as e:
            self.logger.error(f"Single planning session test failed: {e}")
            return False

    def _test_planning_with_continuation(self) -> bool:
        """Test planning continuation with workflow architecture.

        Reuses ``self.api_continuation_id`` when a previous scenario stored it;
        otherwise opens a fresh session first. Then sends one more step and
        checks that the response echoes the same continuation id.

        Returns:
            True when the continued step validates and the id is preserved.
        """
        try:
            self.logger.info(" 1.2: Testing planning continuation with workflow")

            # Use continuation from previous test if available
            continuation_id = getattr(self, "api_continuation_id", None)
            if not continuation_id:
                # Start fresh if no continuation available
                self.logger.info(" 1.2.0: Starting fresh planning session")
                response0, continuation_id = self.call_mcp_tool(
                    "planner",
                    {
                        "step": "Planning API security strategy",
                        "step_number": 1,
                        "total_steps": 2,
                        "next_step_required": True,
                        "model": "flash",
                    },
                )
                if not response0 or not continuation_id:
                    self.logger.error("Failed to start fresh planning session")
                    return False

            # Test continuation step
            self.logger.info(" 1.2.1: Continue planning session")
            response1, _ = self.call_mcp_tool(
                "planner",
                {
                    "step": "Building on the API redesign, let me now plan the security implementation with OAuth 2.0, API keys, and rate limiting strategies.",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": True,
                    "continuation_id": continuation_id,
                    "model": "flash",
                },
            )
            if not response1:
                self.logger.error("Failed to continue planning")
                return False

            response1_data = self._parse_planner_response(response1)
            if not response1_data:
                return False

            # Validate continuation behavior
            if not self._validate_step_response(response1_data, 2, 2, True, "pause_for_planner"):
                return False

            # Check that continuation_id is preserved
            if response1_data.get("continuation_id") != continuation_id:
                self.logger.error("Continuation ID not preserved in workflow")
                return False

            self.logger.info(" ✅ Planning continuation working with workflow")
            return True
        except Exception as e:
            self.logger.error(f"Planning continuation test failed: {e}")
            return False

    def _test_complex_plan_deep_thinking(self) -> bool:
        """Test complex plan with deep thinking pauses.

        Sends steps 1, 2 and 4 of an 8-step plan (step 3 is not sent). Steps 1
        and 2 must report ``pause_for_deep_thinking``; step 4 must fall back to
        ``pause_for_planner`` without ``thinking_required``.

        Returns:
            True when all three responses carry the expected status.
        """
        try:
            self.logger.info(" 1.3: Testing complex plan with deep thinking pauses")

            # Start complex plan (≥5 steps) - should trigger deep thinking
            self.logger.info(" 1.3.1: Step 1 of complex plan (should trigger deep thinking)")
            response1, continuation_id = self.call_mcp_tool(
                "planner",
                {
                    "step": "I need to plan a complete digital transformation for our enterprise organization, including cloud migration, process automation, and cultural change management.",
                    "step_number": 1,
                    "total_steps": 8,  # Complex plan ≥5 steps
                    "next_step_required": True,
                    "model": "flash",
                },
            )
            if not response1 or not continuation_id:
                self.logger.error("Failed to start complex planning")
                return False

            response1_data = self._parse_planner_response(response1)
            if not response1_data:
                return False

            # Should trigger deep thinking pause for complex plan
            if response1_data.get("status") != "pause_for_deep_thinking":
                self.logger.error("Expected deep thinking pause for complex plan step 1")
                return False

            if not response1_data.get("thinking_required"):
                self.logger.error("Expected thinking_required=true for complex plan")
                return False

            # Check required thinking actions
            required_thinking = response1_data.get("required_thinking", [])
            if len(required_thinking) < 4:
                self.logger.error("Expected comprehensive thinking requirements for complex plan")
                return False

            # Check for deep thinking guidance in next_steps
            next_steps = response1_data.get("next_steps", "")
            if "MANDATORY" not in next_steps or "deep thinking" not in next_steps.lower():
                self.logger.error("Expected mandatory deep thinking guidance")
                return False

            self.logger.info(" ✅ Complex plan step 1 correctly triggered deep thinking pause")

            # Step 2 of complex plan - should also trigger deep thinking
            self.logger.info(" 1.3.2: Step 2 of complex plan (should trigger deep thinking)")
            response2, _ = self.call_mcp_tool(
                "planner",
                {
                    "step": "After deep analysis, I can see this transformation requires three parallel tracks: Technical Infrastructure, Business Process, and Human Capital. Let me design the coordination strategy.",
                    "step_number": 2,
                    "total_steps": 8,
                    "next_step_required": True,
                    "continuation_id": continuation_id,
                    "model": "flash",
                },
            )
            if not response2:
                self.logger.error("Failed to continue complex planning")
                return False

            response2_data = self._parse_planner_response(response2)
            if not response2_data:
                return False

            # Step 2 should also trigger deep thinking for complex plans
            if response2_data.get("status") != "pause_for_deep_thinking":
                self.logger.error("Expected deep thinking pause for complex plan step 2")
                return False

            self.logger.info(" ✅ Complex plan step 2 correctly triggered deep thinking pause")

            # Step 4 of complex plan - should use normal flow (after step 3)
            self.logger.info(" 1.3.3: Step 4 of complex plan (should use normal flow)")
            response4, _ = self.call_mcp_tool(
                "planner",
                {
                    "step": "Now moving to tactical planning: Phase 1 execution details with specific timelines and resource allocation for the technical infrastructure track.",
                    "step_number": 4,
                    "total_steps": 8,
                    "next_step_required": True,
                    "continuation_id": continuation_id,
                    "model": "flash",
                },
            )
            if not response4:
                self.logger.error("Failed to continue to step 4")
                return False

            response4_data = self._parse_planner_response(response4)
            if not response4_data:
                return False

            # Step 4 should use normal flow (no more deep thinking pauses)
            if response4_data.get("status") != "pause_for_planner":
                self.logger.error("Expected normal planning flow for step 4")
                return False

            if response4_data.get("thinking_required"):
                self.logger.error("Step 4 should not require special thinking pause")
                return False

            self.logger.info(" ✅ Complex plan transitions to normal flow after step 3")
            return True
        except Exception as e:
            self.logger.error(f"Complex plan deep thinking test failed: {e}")
            return False

    def _test_self_contained_completion(self) -> bool:
        """Test self-contained completion without expert analysis.

        Runs a two-step plan and checks the final response: status
        ``planner_complete``, no ``expert_analysis`` key, a truthy
        ``planning_complete`` and ``plan_summary``, and ``output.instructions``.

        Returns:
            True when the final response satisfies all of those checks.
        """
        try:
            self.logger.info(" 1.4: Testing self-contained completion")

            # Simple planning session that should complete without expert analysis
            self.logger.info(" 1.4.1: Simple planning session")
            response1, continuation_id = self.call_mcp_tool(
                "planner",
                {
                    "step": "Planning a simple website redesign with new color scheme and improved navigation.",
                    "step_number": 1,
                    "total_steps": 2,
                    "next_step_required": True,
                    "model": "flash",
                },
            )
            if not response1 or not continuation_id:
                self.logger.error("Failed to start simple planning")
                return False

            # Final step - should complete without expert analysis
            self.logger.info(" 1.4.2: Final step - self-contained completion")
            response2, _ = self.call_mcp_tool(
                "planner",
                {
                    "step": "Website redesign plan complete: Phase 1 - Update color palette and typography, Phase 2 - Redesign navigation structure and user flows.",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,  # Final step
                    "continuation_id": continuation_id,
                    "model": "flash",
                },
            )
            if not response2:
                self.logger.error("Failed to complete simple planning")
                return False

            response2_data = self._parse_planner_response(response2)
            if not response2_data:
                return False

            # Validate self-contained completion
            if response2_data.get("status") != "planner_complete":
                self.logger.error("Expected self-contained completion status")
                return False

            # Should NOT call expert analysis
            if "expert_analysis" in response2_data:
                self.logger.error("PlannerWorkflow should not call expert analysis")
                return False

            # Should have planning_complete flag
            if not response2_data.get("planning_complete"):
                self.logger.error("Expected planning_complete=true")
                return False

            # Should have plan_summary
            if not response2_data.get("plan_summary"):
                self.logger.error("Expected plan_summary in completion")
                return False

            # Check completion instructions
            output = response2_data.get("output", {})
            if not output.get("instructions"):
                self.logger.error("Missing output instructions for plan presentation")
                return False

            self.logger.info(" ✅ Self-contained completion working correctly")
            return True
        except Exception as e:
            self.logger.error(f"Self-contained completion test failed: {e}")
            return False

    def _test_branching_and_revision(self) -> bool:
        """Test branching and revision with workflow architecture.

        Opens a session, sends a branch step (``branch_id`` "react-native")
        and then a revision of step 2, checking that the ``metadata`` of each
        response records the branch and the revision respectively.

        Returns:
            True when both the branch and the revision metadata are as expected.
        """
        try:
            self.logger.info(" 1.5: Testing branching and revision with workflow")

            # Start planning session for branching test
            self.logger.info(" 1.5.1: Start planning for branching test")
            response1, continuation_id = self.call_mcp_tool(
                "planner",
                {
                    "step": "Planning mobile app development strategy with different technology options to evaluate.",
                    "step_number": 1,
                    "total_steps": 4,
                    "next_step_required": True,
                    "model": "flash",
                },
            )
            if not response1 or not continuation_id:
                self.logger.error("Failed to start branching test")
                return False

            # Create branch
            self.logger.info(" 1.5.2: Create branch for React Native approach")
            response2, _ = self.call_mcp_tool(
                "planner",
                {
                    "step": "Branch A: React Native approach - cross-platform development with shared codebase, faster development cycle, and consistent UI across platforms.",
                    "step_number": 2,
                    "total_steps": 4,
                    "next_step_required": True,
                    "is_branch_point": True,
                    "branch_from_step": 1,
                    "branch_id": "react-native",
                    "continuation_id": continuation_id,
                    "model": "flash",
                },
            )
            if not response2:
                self.logger.error("Failed to create branch")
                return False

            response2_data = self._parse_planner_response(response2)
            if not response2_data:
                return False

            # Validate branching in workflow
            metadata = response2_data.get("metadata", {})
            if not metadata.get("is_branch_point"):
                self.logger.error("Branch point not recorded in workflow")
                return False

            if metadata.get("branch_id") != "react-native":
                self.logger.error("Branch ID not properly recorded")
                return False

            if "react-native" not in metadata.get("branches", []):
                self.logger.error("Branch not added to branches list")
                return False

            self.logger.info(" ✅ Branching working with workflow architecture")

            # Test revision
            self.logger.info(" 1.5.3: Test revision capability")
            response3, _ = self.call_mcp_tool(
                "planner",
                {
                    "step": "Revision of step 2: After consideration, let me revise the React Native approach to include performance optimizations and native module integration for critical features.",
                    "step_number": 3,
                    "total_steps": 4,
                    "next_step_required": True,
                    "is_step_revision": True,
                    "revises_step_number": 2,
                    "continuation_id": continuation_id,
                    "model": "flash",
                },
            )
            if not response3:
                self.logger.error("Failed to create revision")
                return False

            response3_data = self._parse_planner_response(response3)
            if not response3_data:
                return False

            # Validate revision in workflow
            metadata = response3_data.get("metadata", {})
            if not metadata.get("is_step_revision"):
                self.logger.error("Step revision not recorded in workflow")
                return False

            if metadata.get("revises_step_number") != 2:
                self.logger.error("Revised step number not properly recorded")
                return False

            self.logger.info(" ✅ Revision working with workflow architecture")
            return True
        except Exception as e:
            self.logger.error(f"Branching and revision test failed: {e}")
            return False

    def _test_workflow_file_context(self) -> bool:
        """Test workflow file context behavior (should be minimal for planner).

        Runs a two-step plan without files. A ``file_context`` key in the first
        response is only logged, not treated as a failure; the final response
        must have status ``planner_complete``.

        Returns:
            True when the final step completes with the expected status.
        """
        try:
            self.logger.info(" 1.6: Testing workflow file context behavior")

            # Planner typically doesn't use files, but test the workflow handles this correctly
            self.logger.info(" 1.6.1: Planning step with no files (normal case)")
            response1, continuation_id = self.call_mcp_tool(
                "planner",
                {
                    "step": "Planning data architecture for analytics platform.",
                    "step_number": 1,
                    "total_steps": 2,
                    "next_step_required": True,
                    "model": "flash",
                },
            )
            if not response1 or not continuation_id:
                self.logger.error("Failed to start workflow file context test")
                return False

            response1_data = self._parse_planner_response(response1)
            if not response1_data:
                return False

            # Planner workflow should not have file_context since it doesn't use files
            if "file_context" in response1_data:
                self.logger.info(" ℹ️ Workflow file context present but should be minimal for planner")

            # Final step
            self.logger.info(" 1.6.2: Final step (should complete without file embedding)")
            response2, _ = self.call_mcp_tool(
                "planner",
                {
                    "step": "Data architecture plan complete with data lakes, processing pipelines, and analytics layers.",
                    "step_number": 2,
                    "total_steps": 2,
                    "next_step_required": False,
                    "continuation_id": continuation_id,
                    "model": "flash",
                },
            )
            if not response2:
                self.logger.error("Failed to complete workflow file context test")
                return False

            response2_data = self._parse_planner_response(response2)
            if not response2_data:
                return False

            # Final step should complete self-contained
            if response2_data.get("status") != "planner_complete":
                self.logger.error("Expected self-contained completion for planner workflow")
                return False

            self.logger.info(" ✅ Workflow file context behavior appropriate for planner")
            return True
        except Exception as e:
            self.logger.error(f"Workflow file context test failed: {e}")
            return False

    def call_mcp_tool(self, tool_name: str, params: dict) -> tuple[Optional[str], Optional[str]]:
        """Call an MCP tool in-process - override for planner-specific response handling.

        Args:
            tool_name: Name of the tool to invoke.
            params: Arguments passed to the tool.

        Returns:
            ``(response_text, continuation_id)``. Both are None when the tool
            returned no text; ``continuation_id`` alone is None when the text
            carries no usable id.
        """
        # Use in-process implementation to maintain conversation memory
        response_text, _ = self.call_mcp_tool_direct(tool_name, params)
        if not response_text:
            return None, None

        # Extract continuation_id from planner response specifically
        continuation_id = self._extract_planner_continuation_id(response_text)
        return response_text, continuation_id

    def _extract_planner_continuation_id(self, response_text: str) -> Optional[str]:
        """Extract continuation_id from planner response.

        Args:
            response_text: Raw JSON text returned by the planner tool.

        Returns:
            The top-level ``continuation_id`` value, or None when the text is
            not valid JSON, is not a JSON object, or has no such key.
        """
        try:
            response_data = json.loads(response_text)
        except json.JSONDecodeError as e:
            self.logger.debug(f"Failed to parse response for planner continuation_id: {e}")
            return None

        # json.loads may yield a list, string or number; only an object has keys.
        if not isinstance(response_data, dict):
            self.logger.debug(
                f"Planner response is not a JSON object ({type(response_data).__name__}); no continuation_id"
            )
            return None
        return response_data.get("continuation_id")

    def _parse_planner_response(self, response_text: str) -> dict:
        """Parse planner tool JSON response.

        Args:
            response_text: Raw JSON text returned by the planner tool.

        Returns:
            The decoded JSON object, or an empty dict (after logging an error)
            when the text is not valid JSON or does not decode to an object.
            Callers treat an empty dict as a failure.
        """
        try:
            # Parse the response - it should be direct JSON
            parsed = json.loads(response_text)
        except json.JSONDecodeError as e:
            self.logger.error(f"Failed to parse planner response as JSON: {e}")
            self.logger.error(f"Response text: {response_text[:500]}...")
            return {}

        # Honour the declared return type: callers use .get()/.keys() on the result.
        if not isinstance(parsed, dict):
            self.logger.error(f"Planner response is not a JSON object: {type(parsed).__name__}")
            self.logger.error(f"Response text: {response_text[:500]}...")
            return {}
        return parsed

    def _validate_step_response(
        self,
        response_data: dict,
        expected_step: int,
        expected_total: int,
        expected_next_required: bool,
        expected_status: str,
    ) -> bool:
        """Validate a planner step response structure.

        Args:
            response_data: Decoded planner response.
            expected_step: Required value of ``step_number``.
            expected_total: Required value of ``total_steps``.
            expected_next_required: Required value of ``next_step_required``.
            expected_status: Required value of ``status``.

        Returns:
            True when all four fields match and both ``step_content`` and
            ``next_steps`` are truthy; otherwise False, with the first mismatch
            logged as an error.
        """
        try:
            # Check status
            if response_data.get("status") != expected_status:
                self.logger.error(f"Expected status '{expected_status}', got '{response_data.get('status')}'")
                return False

            # Check step number, total steps and next_step_required, in that order
            expected_fields = (
                ("step_number", expected_step),
                ("total_steps", expected_total),
                ("next_step_required", expected_next_required),
            )
            for field, expected in expected_fields:
                actual = response_data.get(field)
                if actual != expected:
                    self.logger.error(f"Expected {field} {expected}, got {actual}")
                    return False

            # Check step_content exists, then next_steps guidance
            required_fields = (
                ("step_content", "Missing step_content in response"),
                ("next_steps", "Missing next_steps guidance in response"),
            )
            for field, message in required_fields:
                if not response_data.get(field):
                    self.logger.error(message)
                    return False

            return True
        except Exception as e:
            self.logger.error(f"Error validating step response: {e}")
            return False