// reflection-engine.test.ts
/**
* Comprehensive tests for ReflectionEngine
* Tests trajectory analysis, failure pattern classification, root cause analysis,
* module responsibility attribution, improvement suggestion generation, and confidence scoring
*
* NOTE: These tests are written BEFORE implementation (Test-First Development)
* All tests should FAIL initially until ReflectionEngine is implemented
*/
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
import type {
ExecutionTrajectory,
ReflectionAnalysis,
LLMResponse,
} from '../types/gepa';
import {
sampleTrajectories,
TestDataGenerator
} from '../test/fixtures/sample-data';
// Mock LLMAdapter for testing
const mockLLMAdapter = {
callLLM: vi.fn(),
analyzeTrajectory: vi.fn(),
generateMutation: vi.fn(),
};
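// The real LLM adapter is not imported in this test file; the mock above
// stands in for it. A hypothetical sketch of the minimal surface these tests
// rely on, inferred from the calls asserted below (the actual LLMAdapter
// interface may expose more methods or use different signatures):
type AssumedLLMAdapter = {
  callLLM: (...args: unknown[]) => Promise<LLMResponse>;
  analyzeTrajectory: (
    trajectory: ExecutionTrajectory,
    analysisPrompt: string
  ) => Promise<ReflectionAnalysis>;
  generateMutation: (...args: unknown[]) => Promise<unknown>;
};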
// Import the ReflectionEngine class that we need to implement
// This import will FAIL until the implementation exists
import { ReflectionEngine } from './reflection-engine';
// Helper function for creating mock analysis with specific confidence
function mockAnalysisWithConfidence(confidence: number): ReflectionAnalysis {
return {
trajectoryId: 'test-id',
promptId: 'test-prompt',
diagnosis: {
moduleResponsibility: new Map(),
patterns: [],
},
suggestions: [],
confidence,
rationale: `Analysis with ${confidence} confidence`,
};
}
// Helper function for creating mock batch response
function mockBatchResponse(patterns: string[], confidence: number): LLMResponse {
return {
content: JSON.stringify({
commonPatterns: patterns.map(p => ({
type: p,
frequency: 1,
description: `Pattern ${p}`,
examples: [`Example for ${p}`],
})),
recommendations: [],
overallConfidence: confidence,
}),
model: 'claude-test',
tokens: { prompt: 300, completion: 150, total: 450 },
finishReason: 'stop' as const,
latency: 1000,
timestamp: new Date(),
} as LLMResponse;
}
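// Hypothetical shape of the engine configuration these tests exercise,
// inferred from the options passed and the defaults asserted below (the real
// ReflectionEngineConfig type lives with the implementation and may differ):
type AssumedReflectionEngineConfig = {
  maxAnalysisDepth?: number;
  confidenceThreshold?: number;
  patternMinFrequency?: number;
  batchSize?: number;
  enableCaching?: boolean;
  cacheTimeout?: number; // milliseconds
};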
describe('ReflectionEngine', () => {
let engine: ReflectionEngine;
let mockTrajectoryStore: any;
beforeEach(() => {
vi.clearAllMocks();
// Mock trajectory store
mockTrajectoryStore = {
query: vi.fn(),
load: vi.fn(),
save: vi.fn(),
};
// Initialize engine with mocked dependencies
engine = new ReflectionEngine({
llmAdapter: mockLLMAdapter as any,
trajectoryStore: mockTrajectoryStore,
config: {
maxAnalysisDepth: 10,
confidenceThreshold: 0.7,
patternMinFrequency: 1,
batchSize: 5,
},
});
});
afterEach(() => {
vi.restoreAllMocks();
});
describe('Constructor and Initialization', () => {
it('should create engine with default configuration', () => {
const defaultEngine = new ReflectionEngine({
llmAdapter: mockLLMAdapter as any,
trajectoryStore: mockTrajectoryStore,
});
expect(defaultEngine).toBeInstanceOf(ReflectionEngine);
expect(defaultEngine.config.maxAnalysisDepth).toBe(5);
expect(defaultEngine.config.confidenceThreshold).toBe(0.8);
expect(defaultEngine.config.patternMinFrequency).toBe(3);
expect(defaultEngine.config.batchSize).toBe(10);
});
it('should create engine with custom configuration', () => {
const customConfig = {
maxAnalysisDepth: 15,
confidenceThreshold: 0.9,
patternMinFrequency: 1,
batchSize: 3,
enableCaching: true,
cacheTimeout: 3600000,
};
const customEngine = new ReflectionEngine({
llmAdapter: mockLLMAdapter as any,
trajectoryStore: mockTrajectoryStore,
config: customConfig,
});
expect(customEngine.config).toEqual(expect.objectContaining(customConfig));
});
it('should validate required dependencies', () => {
expect(() => {
new ReflectionEngine({
llmAdapter: null as any,
trajectoryStore: mockTrajectoryStore,
});
}).toThrow('LLM adapter is required');
expect(() => {
new ReflectionEngine({
llmAdapter: mockLLMAdapter as any,
trajectoryStore: null as any,
});
}).toThrow('Trajectory store is required');
});
});
describe('Single Trajectory Analysis', () => {
let failedTrajectory: ExecutionTrajectory;
let successfulTrajectory: ExecutionTrajectory;
beforeEach(() => {
failedTrajectory = TestDataGenerator.generateExecutionTrajectory({
id: 'failed-traj-001',
promptId: 'prompt-001',
taskId: 'task-001',
finalResult: {
success: false,
score: 0.2,
output: 'Task failed due to missing context',
error: 'Context validation failed',
},
steps: [
{
stepNumber: 1,
action: 'analyze_requirements',
reasoning: 'Analyzing task requirements',
toolName: 'requirement_analyzer',
toolInput: { task: 'complex_analysis' },
toolOutput: { status: 'partial' },
timestamp: new Date(),
},
{
stepNumber: 2,
action: 'validate_context',
reasoning: 'Validating execution context',
toolName: 'context_validator',
toolInput: { context: 'minimal' },
error: 'Insufficient context provided',
timestamp: new Date(),
},
],
});
successfulTrajectory = TestDataGenerator.generateExecutionTrajectory({
id: 'success-traj-001',
promptId: 'prompt-002',
taskId: 'task-002',
finalResult: {
success: true,
score: 0.9,
output: 'Task completed successfully',
},
});
// Mock LLM response for trajectory analysis
mockLLMAdapter.analyzeTrajectory.mockResolvedValue({
trajectoryId: failedTrajectory.id,
promptId: failedTrajectory.promptId,
diagnosis: {
failurePoint: 'Step 2: Context validation',
rootCause: 'Insufficient context information provided to the system',
moduleResponsibility: new Map([
['context_validator', 0.8],
['requirement_analyzer', 0.2],
]),
patterns: [
{
type: 'context_insufficient',
frequency: 1,
description: 'System failed due to lack of sufficient context',
examples: ['Context validation failed'],
},
],
},
suggestions: [
{
type: 'add_instruction',
targetSection: 'context_requirements',
proposedChange: 'Add explicit context gathering step before validation',
rationale: 'Ensures sufficient context is available for validation',
expectedImpact: 0.7,
},
],
confidence: 0.85,
rationale: 'Clear failure pattern with specific error location',
} as ReflectionAnalysis);
});
it('should analyze single failed trajectory', async () => {
const analysis = await engine.analyzeTrajectory(failedTrajectory);
expect(analysis).toBeDefined();
expect(analysis.trajectoryId).toBe(failedTrajectory.id);
expect(analysis.promptId).toBe(failedTrajectory.promptId);
expect(analysis.diagnosis.failurePoint).toContain('Context validation');
expect(analysis.diagnosis.rootCause).toContain('Insufficient context');
expect(analysis.suggestions).toHaveLength(1);
expect(analysis.confidence).toBeGreaterThan(0.8);
expect(mockLLMAdapter.analyzeTrajectory).toHaveBeenCalledWith(
failedTrajectory,
expect.any(String)
);
});
it('should identify failure point accurately', async () => {
const analysis = await engine.analyzeTrajectory(failedTrajectory);
expect(analysis.diagnosis.failurePoint).toBe('Step 2: Context validation');
expect(analysis.diagnosis.failurePoint).toContain('Step 2');
});
it('should perform root cause analysis', async () => {
const analysis = await engine.analyzeTrajectory(failedTrajectory);
expect(analysis.diagnosis.rootCause).toBeDefined();
expect(analysis.diagnosis.rootCause).toContain('context');
expect(analysis.diagnosis.rootCause.length).toBeGreaterThan(10);
});
it('should attribute module responsibility', async () => {
const analysis = await engine.analyzeTrajectory(failedTrajectory);
expect(analysis.diagnosis.moduleResponsibility).toBeInstanceOf(Map);
expect(analysis.diagnosis.moduleResponsibility.size).toBeGreaterThan(0);
expect(analysis.diagnosis.moduleResponsibility.get('context_validator')).toBe(0.8);
expect(analysis.diagnosis.moduleResponsibility.get('requirement_analyzer')).toBe(0.2);
});
it('should classify failure patterns', async () => {
const analysis = await engine.analyzeTrajectory(failedTrajectory);
expect(analysis.diagnosis.patterns).toHaveLength(1);
const pattern = analysis.diagnosis.patterns[0];
expect(pattern.type).toBe('context_insufficient');
expect(pattern.frequency).toBe(1);
expect(pattern.description).toContain('context');
expect(pattern.examples).toContain('Context validation failed');
});
it('should generate improvement suggestions', async () => {
const analysis = await engine.analyzeTrajectory(failedTrajectory);
expect(analysis.suggestions).toHaveLength(1);
const suggestion = analysis.suggestions[0];
expect(suggestion.type).toBe('add_instruction');
expect(suggestion.targetSection).toBe('context_requirements');
expect(suggestion.proposedChange).toContain('context gathering');
expect(suggestion.rationale).toContain('validation');
expect(suggestion.expectedImpact).toBe(0.7);
});
it('should calculate confidence score', async () => {
const analysis = await engine.analyzeTrajectory(failedTrajectory);
expect(analysis.confidence).toBe(0.85);
expect(analysis.confidence).toBeGreaterThan(0);
expect(analysis.confidence).toBeLessThanOrEqual(1);
});
it('should provide analysis rationale', async () => {
const analysis = await engine.analyzeTrajectory(failedTrajectory);
expect(analysis.rationale).toBeDefined();
expect(analysis.rationale).toContain('failure pattern');
expect(analysis.rationale.length).toBeGreaterThan(10);
});
it('should handle successful trajectories differently', async () => {
mockLLMAdapter.analyzeTrajectory.mockResolvedValue({
trajectoryId: successfulTrajectory.id,
promptId: successfulTrajectory.promptId,
diagnosis: {
moduleResponsibility: new Map(),
patterns: [],
},
suggestions: [],
confidence: 0.95,
rationale: 'Successful execution with high performance',
} as ReflectionAnalysis);
const analysis = await engine.analyzeTrajectory(successfulTrajectory);
expect(analysis.suggestions).toHaveLength(0);
expect(analysis.diagnosis.patterns).toHaveLength(0);
expect(analysis.confidence).toBeGreaterThan(0.9);
});
it('should handle LLM analysis errors gracefully', async () => {
mockLLMAdapter.analyzeTrajectory.mockRejectedValue(new Error('LLM service unavailable'));
await expect(engine.analyzeTrajectory(failedTrajectory)).rejects.toThrow(
'Failed to analyze trajectory: LLM service unavailable'
);
});
});
describe('Batch Trajectory Analysis', () => {
let trajectories: ExecutionTrajectory[];
beforeEach(() => {
trajectories = [
TestDataGenerator.generateExecutionTrajectory({
id: 'batch-traj-001',
promptId: 'prompt-001',
taskId: 'task-001',
finalResult: { success: false, score: 0.3, output: 'Context error' },
}),
TestDataGenerator.generateExecutionTrajectory({
id: 'batch-traj-002',
promptId: 'prompt-001',
taskId: 'task-002',
finalResult: { success: false, score: 0.25, output: 'Validation error' },
}),
TestDataGenerator.generateExecutionTrajectory({
id: 'batch-traj-003',
promptId: 'prompt-001',
taskId: 'task-003',
finalResult: { success: true, score: 0.9, output: 'Success' },
}),
];
// Mock batch analysis response
mockLLMAdapter.callLLM.mockResolvedValue({
content: JSON.stringify({
commonPatterns: [
{
type: 'context_validation_failure',
frequency: 2,
description: 'Multiple failures due to context validation issues',
examples: ['Context error', 'Validation error'],
trajectoryIds: ['batch-traj-001', 'batch-traj-002'],
},
],
recommendations: [
{
priority: 'high',
type: 'add_instruction',
targetSection: 'validation',
proposedChange: 'Add pre-validation context checks',
rationale: 'Prevent context validation failures',
expectedImpact: 0.8,
affectedTrajectories: ['batch-traj-001', 'batch-traj-002'],
},
],
overallConfidence: 0.82,
}),
model: 'claude-test',
tokens: { prompt: 500, completion: 200, total: 700 },
finishReason: 'stop' as const,
latency: 1500,
timestamp: new Date(),
} as LLMResponse);
});
it('should analyze multiple trajectories in batch', async () => {
const analysis = await engine.analyzeBatch(trajectories);
expect(analysis).toBeDefined();
expect(analysis.trajectoryIds).toEqual(trajectories.map(t => t.id));
expect(analysis.commonPatterns).toHaveLength(1);
expect(analysis.recommendations).toHaveLength(1);
expect(analysis.overallConfidence).toBe(0.82);
});
it('should identify common failure patterns across trajectories', async () => {
const analysis = await engine.analyzeBatch(trajectories);
const pattern = analysis.commonPatterns[0];
expect(pattern.type).toBe('context_validation_failure');
expect(pattern.frequency).toBe(2);
expect(pattern.trajectoryIds).toEqual(['batch-traj-001', 'batch-traj-002']);
expect(pattern.examples).toContain('Context error');
expect(pattern.examples).toContain('Validation error');
});
it('should generate prioritized recommendations', async () => {
const analysis = await engine.analyzeBatch(trajectories);
const recommendation = analysis.recommendations[0];
expect(recommendation.priority).toBe('high');
expect(recommendation.type).toBe('add_instruction');
expect(recommendation.expectedImpact).toBe(0.8);
expect(recommendation.affectedTrajectories).toHaveLength(2);
});
it('should handle empty trajectory list', async () => {
await expect(engine.analyzeBatch([])).rejects.toThrow(
'At least one trajectory is required for batch analysis'
);
});
it('should respect batch size configuration', async () => {
const largeBatch = TestDataGenerator.generateExecutionTrajectories(20, {});
// Engine configured with batchSize: 5
await engine.analyzeBatch(largeBatch);
// Should process in chunks of 5, so 4 LLM calls
expect(mockLLMAdapter.callLLM).toHaveBeenCalledTimes(4);
});
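// Chunking assumption behind the call-count assertion above: with batchSize 5,
// 20 trajectories should be split into Math.ceil(20 / 5) = 4 chunks, one LLM
// call per chunk. The real engine may batch differently (e.g. issue calls in
// parallel), but the total call count should still be 4.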
it('should aggregate results from multiple batches', async () => {
const largeBatch = Array.from({ length: 12 }, (_, i) =>
TestDataGenerator.generateExecutionTrajectory({ id: `large-batch-${i}` })
);
// Mock multiple batch responses
mockLLMAdapter.callLLM
.mockResolvedValueOnce(mockBatchResponse(['pattern1'], 0.8))
.mockResolvedValueOnce(mockBatchResponse(['pattern2'], 0.75))
.mockResolvedValueOnce(mockBatchResponse(['pattern1', 'pattern3'], 0.85));
const analysis = await engine.analyzeBatch(largeBatch);
expect(analysis.commonPatterns.length).toBeGreaterThan(1);
expect(analysis.overallConfidence).toBeCloseTo(0.8, 1); // Weighted average
});
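// Aggregation assumption behind the toBeCloseTo check above: the three chunk
// confidences are combined into one overall score, here roughly
// (0.8 + 0.75 + 0.85) / 3 = 0.8. The real engine may weight chunks by size or
// by pattern count rather than taking a simple mean; the test only pins the
// result to one decimal place.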
});
describe('Pattern Classification and Aggregation', () => {
let multipleTrajectories: ExecutionTrajectory[];
beforeEach(() => {
multipleTrajectories = [
TestDataGenerator.generateExecutionTrajectory({
id: 'pattern-traj-001',
steps: [
{
stepNumber: 1,
action: 'validate_input',
error: 'Invalid input format',
timestamp: new Date(),
},
],
finalResult: { success: false, score: 0.1, output: 'Input validation failed' },
}),
TestDataGenerator.generateExecutionTrajectory({
id: 'pattern-traj-002',
steps: [
{
stepNumber: 1,
action: 'validate_input',
error: 'Missing required fields',
timestamp: new Date(),
},
],
finalResult: { success: false, score: 0.15, output: 'Validation error' },
}),
TestDataGenerator.generateExecutionTrajectory({
id: 'pattern-traj-003',
steps: [
{
stepNumber: 1,
action: 'process_data',
error: 'Timeout exceeded',
timestamp: new Date(),
},
],
finalResult: { success: false, score: 0.3, output: 'Processing timeout' },
}),
];
mockLLMAdapter.callLLM.mockResolvedValue({
content: JSON.stringify({
commonPatterns: [
{
type: 'input_validation_failure',
frequency: 2,
description: 'Input validation consistently fails',
examples: ['Invalid input format', 'Missing required fields'],
severity: 'high',
},
{
type: 'timeout_error',
frequency: 1,
description: 'Processing timeouts occur',
examples: ['Timeout exceeded'],
severity: 'medium',
},
],
recommendations: [],
overallConfidence: 0.88,
}),
model: 'claude-test',
tokens: { prompt: 400, completion: 180, total: 580 },
finishReason: 'stop' as const,
latency: 1200,
timestamp: new Date(),
} as LLMResponse);
});
it('should classify patterns by type and frequency', async () => {
const analysis = await engine.analyzeBatch(multipleTrajectories);
expect(analysis.commonPatterns).toHaveLength(2);
const validationPattern = analysis.commonPatterns.find(p => p.type === 'input_validation_failure');
expect(validationPattern).toBeDefined();
expect(validationPattern!.frequency).toBe(2);
expect(validationPattern!.examples).toHaveLength(2);
const timeoutPattern = analysis.commonPatterns.find(p => p.type === 'timeout_error');
expect(timeoutPattern).toBeDefined();
expect(timeoutPattern!.frequency).toBe(1);
});
it('should filter patterns by minimum frequency', async () => {
// Create engine with higher minimum frequency
const strictEngine = new ReflectionEngine({
llmAdapter: mockLLMAdapter as any,
trajectoryStore: mockTrajectoryStore,
config: { patternMinFrequency: 2 },
});
const analysis = await strictEngine.analyzeBatch(multipleTrajectories);
// Should only include patterns with frequency >= 2
const patterns = analysis.commonPatterns;
expect(patterns.every(p => p.frequency >= 2)).toBe(true);
});
it('should prioritize patterns by severity and frequency', async () => {
const analysis = await engine.analyzeBatch(multipleTrajectories);
const patterns = analysis.commonPatterns;
// Higher frequency pattern should come first
expect(patterns[0].frequency).toBeGreaterThanOrEqual(patterns[1].frequency);
});
it('should extract meaningful examples from trajectories', async () => {
const analysis = await engine.analyzeBatch(multipleTrajectories);
const validationPattern = analysis.commonPatterns.find(p => p.type === 'input_validation_failure');
expect(validationPattern!.examples).toContain('Invalid input format');
expect(validationPattern!.examples).toContain('Missing required fields');
});
});
describe('Improvement Suggestion Generation', () => {
let failureTrajectory: ExecutionTrajectory;
beforeEach(() => {
failureTrajectory = TestDataGenerator.generateExecutionTrajectory({
id: 'suggestion-traj-001',
promptId: 'prompt-suggestion-001',
taskId: 'task-suggestion-001',
steps: [
{
stepNumber: 1,
action: 'parse_requirements',
reasoning: 'Parsing user requirements',
toolName: 'requirement_parser',
toolInput: { text: 'ambiguous requirements' },
toolOutput: { parsed: false, confidence: 0.3 },
timestamp: new Date(),
},
{
stepNumber: 2,
action: 'generate_plan',
reasoning: 'Creating execution plan',
error: 'Cannot create plan with ambiguous requirements',
timestamp: new Date(),
},
],
finalResult: {
success: false,
score: 0.2,
output: 'Failed to generate execution plan',
error: 'Requirements too ambiguous',
},
});
mockLLMAdapter.analyzeTrajectory.mockResolvedValue({
trajectoryId: failureTrajectory.id,
promptId: failureTrajectory.promptId,
diagnosis: {
failurePoint: 'Step 1: Requirement parsing',
rootCause: 'Ambiguous requirements cannot be parsed effectively',
moduleResponsibility: new Map([['requirement_parser', 0.9]]),
patterns: [
{
type: 'ambiguous_requirements',
frequency: 1,
description: 'Requirements are too ambiguous for effective parsing',
examples: ['ambiguous requirements'],
},
],
},
suggestions: [
{
type: 'add_instruction',
targetSection: 'requirement_clarification',
proposedChange: 'Add step to request clarification for ambiguous requirements',
rationale: 'Clarification reduces ambiguity and improves parsing success',
expectedImpact: 0.8,
},
{
type: 'add_example',
targetSection: 'requirement_format',
proposedChange: 'Provide examples of well-formed requirements',
rationale: 'Examples guide users to improve requirements clarity',
expectedImpact: 0.6,
},
{
type: 'add_constraint',
targetSection: 'input_validation',
proposedChange: 'Require minimum confidence threshold for parsed requirements',
rationale: 'Prevents processing of poorly parsed requirements and improves quality',
expectedImpact: 0.7,
},
],
confidence: 0.85,
rationale: 'Clear failure pattern with specific improvement opportunities',
} as ReflectionAnalysis);
});
it('should generate multiple types of improvements', async () => {
const analysis = await engine.analyzeTrajectory(failureTrajectory);
expect(analysis.suggestions).toHaveLength(3);
const types = analysis.suggestions.map(s => s.type);
expect(types).toContain('add_instruction');
expect(types).toContain('add_example');
expect(types).toContain('add_constraint');
});
it('should provide specific target sections for improvements', async () => {
const analysis = await engine.analyzeTrajectory(failureTrajectory);
const targetSections = analysis.suggestions.map(s => s.targetSection);
expect(targetSections).toContain('requirement_clarification');
expect(targetSections).toContain('requirement_format');
expect(targetSections).toContain('input_validation');
});
it('should include detailed proposed changes', async () => {
const analysis = await engine.analyzeTrajectory(failureTrajectory);
const clarificationSuggestion = analysis.suggestions.find(
s => s.targetSection === 'requirement_clarification'
);
expect(clarificationSuggestion!.proposedChange).toContain('clarification');
expect(clarificationSuggestion!.proposedChange.length).toBeGreaterThan(20);
});
it('should provide rationale for each suggestion', async () => {
const analysis = await engine.analyzeTrajectory(failureTrajectory);
analysis.suggestions.forEach(suggestion => {
expect(suggestion.rationale).toBeDefined();
expect(suggestion.rationale.length).toBeGreaterThan(10);
expect(suggestion.rationale).toContain('improve');
});
});
it('should estimate impact of each suggestion', async () => {
const analysis = await engine.analyzeTrajectory(failureTrajectory);
analysis.suggestions.forEach(suggestion => {
expect(suggestion.expectedImpact).toBeGreaterThan(0);
expect(suggestion.expectedImpact).toBeLessThanOrEqual(1);
});
// Should have varying impact estimates
const impacts = analysis.suggestions.map(s => s.expectedImpact);
const uniqueImpacts = new Set(impacts);
expect(uniqueImpacts.size).toBeGreaterThan(1);
});
it('should prioritize suggestions by expected impact', async () => {
const analysis = await engine.analyzeTrajectory(failureTrajectory);
const sortedSuggestions = [...analysis.suggestions].sort((a, b) => b.expectedImpact - a.expectedImpact);
// First suggestion should have highest impact
expect(sortedSuggestions[0].expectedImpact).toBe(0.8);
expect(sortedSuggestions[0].type).toBe('add_instruction');
});
});
describe('Confidence Scoring', () => {
it('should calculate confidence based on trajectory completeness', async () => {
const completeTrajectory = TestDataGenerator.generateExecutionTrajectory({
id: 'complete-traj',
steps: Array.from({ length: 10 }, (_, i) => ({
stepNumber: i + 1,
action: `step_${i}`,
reasoning: `Detailed reasoning for step ${i}`,
toolName: `tool_${i}`,
toolInput: { data: `input_${i}` },
toolOutput: { result: `output_${i}` },
timestamp: new Date(),
})),
finalResult: { success: false, score: 0.3, output: 'Failed at step 10' },
});
const incompleteTrajectory = TestDataGenerator.generateExecutionTrajectory({
id: 'incomplete-traj',
steps: [
{
stepNumber: 1,
action: 'step_1',
timestamp: new Date(),
},
],
finalResult: { success: false, score: 0.1, output: 'Failed early' },
});
// Mock different confidence levels
mockLLMAdapter.analyzeTrajectory
.mockResolvedValueOnce(mockAnalysisWithConfidence(0.9))
.mockResolvedValueOnce(mockAnalysisWithConfidence(0.4));
const completeAnalysis = await engine.analyzeTrajectory(completeTrajectory);
const incompleteAnalysis = await engine.analyzeTrajectory(incompleteTrajectory);
expect(completeAnalysis.confidence).toBeGreaterThan(incompleteAnalysis.confidence);
expect(completeAnalysis.confidence).toBe(0.9);
expect(incompleteAnalysis.confidence).toBe(0.4);
});
it('should lower confidence for contradictory patterns', async () => {
const contradictoryTrajectories = [
TestDataGenerator.generateExecutionTrajectory({
id: 'contra-1',
finalResult: { success: false, score: 0.1, output: 'Failed due to X' },
}),
TestDataGenerator.generateExecutionTrajectory({
id: 'contra-2',
finalResult: { success: false, score: 0.9, output: 'Failed due to Y' },
}),
];
mockLLMAdapter.callLLM.mockResolvedValue({
content: JSON.stringify({
commonPatterns: [
{
type: 'contradictory_pattern',
frequency: 2,
description: 'Contradictory failure reasons',
examples: ['Failed due to X', 'Failed due to Y'],
},
],
recommendations: [],
overallConfidence: 0.3, // Low confidence due to contradictions
}),
model: 'claude-test',
tokens: { prompt: 200, completion: 100, total: 300 },
finishReason: 'stop' as const,
latency: 800,
timestamp: new Date(),
} as LLMResponse);
const analysis = await engine.analyzeBatch(contradictoryTrajectories);
expect(analysis.overallConfidence).toBeLessThan(0.5);
});
it('should increase confidence with consistent patterns', async () => {
const consistentTrajectories = Array.from({ length: 5 }, (_, i) =>
TestDataGenerator.generateExecutionTrajectory({
id: `consistent-${i}`,
finalResult: { success: false, score: 0.2, output: 'Same error type' },
})
);
mockLLMAdapter.callLLM.mockResolvedValue({
content: JSON.stringify({
commonPatterns: [
{
type: 'consistent_error',
frequency: 5,
description: 'Highly consistent error pattern',
examples: Array(5).fill('Same error type'),
},
],
recommendations: [],
overallConfidence: 0.95, // High confidence due to consistency
}),
model: 'claude-test',
tokens: { prompt: 300, completion: 150, total: 450 },
finishReason: 'stop' as const,
latency: 1000,
timestamp: new Date(),
} as LLMResponse);
const analysis = await engine.analyzeBatch(consistentTrajectories);
expect(analysis.overallConfidence).toBeGreaterThan(0.9);
});
});
describe('Edge Cases and Error Handling', () => {
it('should handle empty trajectory steps', async () => {
const emptyTrajectory = TestDataGenerator.generateExecutionTrajectory({
id: 'empty-traj',
steps: [],
finalResult: { success: false, score: 0, output: 'No steps executed' },
});
mockLLMAdapter.analyzeTrajectory.mockResolvedValue({
trajectoryId: emptyTrajectory.id,
promptId: emptyTrajectory.promptId,
diagnosis: {
failurePoint: 'No execution steps',
rootCause: 'Trajectory contains no execution steps',
moduleResponsibility: new Map(),
patterns: [
{
type: 'empty_execution',
frequency: 1,
description: 'No steps were executed',
examples: ['No steps executed'],
},
],
},
suggestions: [
{
type: 'add_instruction',
targetSection: 'execution_flow',
proposedChange: 'Ensure execution steps are properly generated',
rationale: 'Empty execution indicates flow control issues',
expectedImpact: 0.9,
},
],
confidence: 0.8,
rationale: 'Empty trajectory analysis',
} as ReflectionAnalysis);
const analysis = await engine.analyzeTrajectory(emptyTrajectory);
expect(analysis).toBeDefined();
expect(analysis.diagnosis.patterns[0].type).toBe('empty_execution');
expect(analysis.suggestions[0].proposedChange).toContain('execution steps');
});
it('should handle corrupted trajectory data', async () => {
const corruptedTrajectory = {
id: 'corrupted-traj',
promptId: null,
taskId: undefined,
steps: 'not-an-array',
finalResult: null,
} as any;
await expect(engine.analyzeTrajectory(corruptedTrajectory)).rejects.toThrow(
'Invalid trajectory data'
);
});
it('should handle LLM service timeouts', async () => {
const trajectory = sampleTrajectories[0];
mockLLMAdapter.analyzeTrajectory.mockRejectedValue(new Error('Request timeout'));
await expect(engine.analyzeTrajectory(trajectory)).rejects.toThrow(
'Failed to analyze trajectory: Request timeout'
);
});
it('should handle malformed LLM responses', async () => {
const trajectory = sampleTrajectories[0];
mockLLMAdapter.analyzeTrajectory.mockResolvedValue({
trajectoryId: trajectory.id,
// Missing required fields
} as any);
await expect(engine.analyzeTrajectory(trajectory)).rejects.toThrow(
'Invalid analysis response from LLM'
);
});
it('should handle very large trajectories', async () => {
const largeTrajectory = TestDataGenerator.generateExecutionTrajectory({
id: 'large-traj',
steps: Array.from({ length: 1000 }, (_, i) => ({
stepNumber: i + 1,
action: `action_${i}`,
reasoning: `Reasoning for step ${i}`.repeat(10),
timestamp: new Date(),
})),
});
mockLLMAdapter.analyzeTrajectory.mockResolvedValue(mockAnalysisWithConfidence(0.7));
const analysis = await engine.analyzeTrajectory(largeTrajectory);
expect(analysis).toBeDefined();
expect(analysis.confidence).toBe(0.7);
});
it('should validate confidence threshold', async () => {
const highThresholdEngine = new ReflectionEngine({
llmAdapter: mockLLMAdapter as any,
trajectoryStore: mockTrajectoryStore,
config: { confidenceThreshold: 0.9 },
});
const trajectory = sampleTrajectories[0];
mockLLMAdapter.analyzeTrajectory.mockResolvedValue(mockAnalysisWithConfidence(0.6));
await expect(highThresholdEngine.analyzeTrajectory(trajectory)).rejects.toThrow(
'Analysis confidence (0.6) below threshold (0.9)'
);
});
});
describe('Caching and Performance', () => {
beforeEach(() => {
// Create engine with caching enabled
engine = new ReflectionEngine({
llmAdapter: mockLLMAdapter as any,
trajectoryStore: mockTrajectoryStore,
config: {
enableCaching: true,
cacheTimeout: 3600000, // 1 hour
},
});
});
it('should cache analysis results', async () => {
const trajectory = sampleTrajectories[0];
mockLLMAdapter.analyzeTrajectory.mockResolvedValue(mockAnalysisWithConfidence(0.8));
// First call
const analysis1 = await engine.analyzeTrajectory(trajectory);
// Second call (should use cache)
const analysis2 = await engine.analyzeTrajectory(trajectory);
expect(analysis1).toEqual(analysis2);
expect(mockLLMAdapter.analyzeTrajectory).toHaveBeenCalledTimes(1);
});
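// Caching assumption: analyses are keyed by something stable for a given
// trajectory (e.g. its id), so re-analyzing the identical trajectory hits the
// cache rather than the LLM. The exact cache key is an implementation detail
// this test deliberately leaves open.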
it('should respect cache timeout', async () => {
const shortCacheEngine = new ReflectionEngine({
llmAdapter: mockLLMAdapter as any,
trajectoryStore: mockTrajectoryStore,
config: {
enableCaching: true,
cacheTimeout: 100, // 100ms
},
});
const trajectory = sampleTrajectories[0];
mockLLMAdapter.analyzeTrajectory.mockResolvedValue(mockAnalysisWithConfidence(0.8));
// First call
await shortCacheEngine.analyzeTrajectory(trajectory);
// Wait for cache to expire
await new Promise(resolve => setTimeout(resolve, 150));
// Second call (cache expired)
await shortCacheEngine.analyzeTrajectory(trajectory);
expect(mockLLMAdapter.analyzeTrajectory).toHaveBeenCalledTimes(2);
});
it('should handle batch analysis efficiently', async () => {
const trajectories = Array.from({ length: 100 }, (_, i) =>
TestDataGenerator.generateExecutionTrajectory({ id: `batch-${i}` })
);
mockLLMAdapter.callLLM.mockResolvedValue({
content: JSON.stringify({
commonPatterns: [],
recommendations: [],
overallConfidence: 0.8,
}),
model: 'claude-test',
tokens: { prompt: 1000, completion: 500, total: 1500 },
finishReason: 'stop' as const,
latency: 2000,
timestamp: new Date(),
} as LLMResponse);
// Set up the mock before starting the timer so only analyzeBatch is measured
const startTime = process.hrtime.bigint();
await engine.analyzeBatch(trajectories);
const endTime = process.hrtime.bigint();
const totalTime = Number(endTime - startTime) / 1_000_000; // Convert to milliseconds
// Should complete in reasonable time (less than 10 seconds for 100 trajectories)
expect(totalTime).toBeLessThan(10000);
}, 15000); // 15 second timeout for this test
});
describe('Integration with Trajectory Store', () => {
it('should query related trajectories for pattern analysis', async () => {
const promptId = 'test-prompt-123';
const relatedTrajectories = Array.from({ length: 3 }, (_, i) =>
TestDataGenerator.generateExecutionTrajectory({
id: `related-${i}`,
promptId,
})
);
mockTrajectoryStore.query.mockResolvedValue(relatedTrajectories);
// Mock LLM response for batch analysis
mockLLMAdapter.callLLM.mockResolvedValue({
content: JSON.stringify({
commonPatterns: [
{
type: 'test_pattern',
frequency: 2,
description: 'Test pattern description',
examples: ['example1', 'example2'],
},
],
recommendations: [],
overallConfidence: 0.8,
}),
model: 'claude-test',
tokens: { prompt: 200, completion: 100, total: 300 },
finishReason: 'stop' as const,
latency: 1000,
timestamp: new Date(),
} as LLMResponse);
const patterns = await engine.findPatternsForPrompt(promptId);
expect(mockTrajectoryStore.query).toHaveBeenCalledWith({
promptId,
limit: expect.any(Number),
});
expect(patterns).toBeDefined();
});
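// Assumed flow for findPatternsForPrompt, inferred from the mocks above:
// query the trajectory store for trajectories matching the promptId, then run
// them through the batch-analysis path (callLLM) and return the aggregated
// patterns. The real implementation may add filtering, limits, or caching.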
it('should handle trajectory store errors gracefully', async () => {
const promptId = 'test-prompt-123';
mockTrajectoryStore.query.mockRejectedValue(new Error('Database connection failed'));
await expect(engine.findPatternsForPrompt(promptId)).rejects.toThrow(
'Failed to query trajectories: Database connection failed'
);
});
});
});