// gepa-e2e.test.ts • 22.9 kB
/**
* GEPA End-to-End Integration Tests
*
* Comprehensive E2E test suite covering:
* - Complete evolution workflows
* - MCP tool integrations
* - Performance and load testing
* - Memory system validation
* - Concurrent operations
* - Error handling and recovery
*/
import { describe, test, expect, beforeAll, afterAll, beforeEach } from 'vitest';
import { E2ETestRunner, type E2ETestConfig, type E2ETestResults } from './e2e-test-runner';
/**
 * Runner configuration shared by every suite in this file.
 * Timeouts and performance thresholds are given in milliseconds.
 */
const testConfig: Partial<E2ETestConfig> = {
  maxConcurrentTests: 3,
  defaultTimeout: 60_000, // one minute per E2E test
  performanceThresholds: {
    evolutionTime: 30_000, // 30 s
    trajectoryRecording: 1_000, // 1 s
    paretoFrontierQuery: 500, // 0.5 s
    memoryOperations: 100, // 0.1 s
  },
  retryOptions: {
    maxRetries: 3,
    baseDelay: 1_000, // NOTE(review): presumably milliseconds — confirm against E2ETestConfig
  },
};
// Module-level state shared by the suites below.
// `testRunner` is assigned in the top-level `beforeAll` hook and released in `afterAll`.
// `testResults` is assigned by the 'Generate Comprehensive Test Report' test.
let testRunner: E2ETestRunner;
let testResults: E2ETestResults;
/**
* Test suite setup and configuration
*/
describe('GEPA End-to-End Integration Tests', () => {
/**
 * One-time suite setup: constructs the shared runner from `testConfig` and
 * waits for `initialize()` to settle before any test runs.
 */
beforeAll(async () => {
  // eslint-disable-next-line no-console
  console.log('🚀 Initializing GEPA E2E test environment...');
  testRunner = new E2ETestRunner(testConfig);
  await testRunner.initialize();
  // eslint-disable-next-line no-console
  console.log('✅ E2E test environment ready');
}, 120000); // 2 minute timeout for setup
/**
 * One-time suite teardown: calls `cleanup()` on the shared runner.
 * The guard covers the case where `beforeAll` failed before assigning it.
 */
afterAll(async () => {
  // eslint-disable-next-line no-console
  console.log('🧹 Cleaning up E2E test environment...');
  if (testRunner) {
    await testRunner.cleanup();
  }
  // eslint-disable-next-line no-console
  console.log('✅ E2E test environment cleaned up');
}, 30000); // 30 second timeout for cleanup
/**
 * Logs the name of the test about to run (as reported by
 * `expect.getState()`), falling back to 'unknown' when no name is available.
 */
beforeEach(() => {
  // No per-test state is reset here yet; the hook only announces the test.
  // eslint-disable-next-line no-console
  console.log(`🔄 Preparing for test: ${expect.getState().currentTestName || 'unknown'}`);
});
/**
* Core Workflow Tests
*/
describe('Core Workflow Integration', () => {
/**
 * Drives one evolution workflow end to end through the MCP tools, in order:
 *   1. `gepa_start_evolution`     – start evolution from a seed prompt
 *   2. `gepa_record_trajectory`   – record five trajectories (sequentially)
 *   3. `gepa_evaluate_prompt`     – evaluate a candidate across three tasks
 *   4. `gepa_get_pareto_frontier` – query the frontier
 *   5. `gepa_select_optimal`      – select the optimal candidate
 *   6. `gepa_reflect`             – run reflection analysis
 * Every call must report `success`, selected responses must contain their
 * expected banner text, and the whole sequence must finish in under 60 s.
 */
test('Complete Evolution Cycle - Start to Finish', async () => {
  // eslint-disable-next-line no-console
  console.log('🧬 Testing complete evolution cycle...');
  const testStartTime = Date.now();
  try {
    // Step 1: Start evolution process
    const evolutionResult = await testRunner.environment!.testHelpers.callMCPTool(
      'gepa_start_evolution',
      {
        taskDescription: 'E2E Integration Test - Complete workflow validation',
        seedPrompt: 'You are an AI assistant optimized for comprehensive task execution and analysis.',
        config: {
          populationSize: 10,
          maxGenerations: 3,
          mutationRate: 0.2,
        },
      }
    );
    expect(evolutionResult.success).toBe(true);
    expect(evolutionResult.content[0].text).toContain('Evolution Process Started');
    // The tool response is not parsed for an ID here; a synthetic, time-based
    // ID is used to derive the prompt IDs passed to the later steps.
    const evolutionId = `e2e-evolution-${Date.now()}`;
    // Step 2: Record multiple trajectory executions (one awaited call at a time)
    const trajectoryResults = [];
    for (let i = 0; i < 5; i++) {
      const trajectoryResult = await testRunner.environment!.testHelpers.callMCPTool(
        'gepa_record_trajectory',
        {
          promptId: `${evolutionId}-candidate-${i}`,
          taskId: `e2e-task-${i}`,
          executionSteps: [
            {
              stepNumber: 1,
              action: 'initialize',
              timestamp: new Date().toISOString(),
              success: true,
            },
            {
              stepNumber: 2,
              action: 'process',
              timestamp: new Date().toISOString(),
              success: true,
            },
            {
              stepNumber: 3,
              action: 'finalize',
              timestamp: new Date().toISOString(),
              success: true,
            },
          ],
          // Randomised payload: the assertions below only check that the
          // recording call itself succeeds, not these values.
          result: {
            success: Math.random() > 0.2, // 80% success rate
            score: Math.random() * 0.5 + 0.5, // 0.5-1.0
            output: { taskCompleted: true, score: Math.random() },
          },
          metadata: {
            executionTime: Math.random() * 2000 + 500,
            tokenUsage: Math.random() * 1000 + 100,
          },
        }
      );
      expect(trajectoryResult.success).toBe(true);
      trajectoryResults.push(trajectoryResult);
    }
    // Step 3: Evaluate prompt performance
    const evaluationResult = await testRunner.environment!.testHelpers.callMCPTool(
      'gepa_evaluate_prompt',
      {
        promptId: `${evolutionId}-best-candidate`,
        taskIds: ['e2e-task-1', 'e2e-task-2', 'e2e-task-3'],
        rolloutCount: 3,
        parallel: true,
      }
    );
    expect(evaluationResult.success).toBe(true);
    expect(evaluationResult.content[0].text).toContain('Prompt Evaluation Complete');
    // Step 4: Get Pareto frontier results
    const frontierResult = await testRunner.environment!.testHelpers.callMCPTool(
      'gepa_get_pareto_frontier',
      {
        minPerformance: 0.6,
        limit: 5,
      }
    );
    expect(frontierResult.success).toBe(true);
    expect(frontierResult.content[0].text).toContain('Pareto Frontier Results');
    // Step 5: Select optimal candidate
    const selectionResult = await testRunner.environment!.testHelpers.callMCPTool(
      'gepa_select_optimal',
      {
        taskContext: 'E2E integration test context',
        performanceWeight: 0.8,
        diversityWeight: 0.2,
      }
    );
    expect(selectionResult.success).toBe(true);
    expect(selectionResult.content[0].text).toContain('Optimal Candidate Selected');
    // Step 6: Perform reflection analysis.
    // Trajectory IDs are synthesised by index for the first three recordings.
    const reflectionResult = await testRunner.environment!.testHelpers.callMCPTool(
      'gepa_reflect',
      {
        trajectoryIds: trajectoryResults.slice(0, 3).map((_, i) => `trajectory-${i}`),
        targetPromptId: `${evolutionId}-best-candidate`,
        analysisDepth: 'deep',
      }
    );
    expect(reflectionResult.success).toBe(true);
    expect(reflectionResult.content[0].text).toContain('Reflection Analysis Complete');
    // Verify overall workflow timing
    const totalExecutionTime = Date.now() - testStartTime;
    expect(totalExecutionTime).toBeLessThan(60000); // Should complete within 60 seconds
    // eslint-disable-next-line no-console
    console.log(`✅ Complete evolution cycle completed in ${totalExecutionTime}ms`);
  } catch (error) {
    // Log for context, then rethrow so the test still fails.
    // eslint-disable-next-line no-console
    console.error(`❌ Evolution cycle failed: ${error}`);
    throw error;
  }
}, 90000); // 90 second timeout
/**
 * Builds the complete-evolution scenario, executes it through the test
 * helpers, and checks that every returned trajectory carries an id, promptId,
 * taskId, at least one step, and a numeric final score.
 */
test('Trajectory Recording and Analysis Workflow', async () => {
  // eslint-disable-next-line no-console
  console.log('📊 Testing trajectory recording and analysis...');
  // Create test scenario for trajectory analysis
  const scenario = await testRunner.environment!.scenarios.createCompleteEvolutionScenario();
  const result = await testRunner.environment!.testHelpers.executeScenario(scenario);
  expect(result.success).toBe(true);
  expect(result.trajectories).toBeDefined();
  expect(result.trajectories!.length).toBeGreaterThan(0);
  // Verify trajectory data integrity
  for (const trajectory of result.trajectories!) {
    expect(trajectory.id).toBeDefined();
    expect(trajectory.promptId).toBeDefined();
    expect(trajectory.taskId).toBeDefined();
    expect(trajectory.steps.length).toBeGreaterThan(0);
    expect(trajectory.finalResult).toBeDefined();
    expect(typeof trajectory.finalResult.score).toBe('number');
  }
  // eslint-disable-next-line no-console
  console.log(`✅ Trajectory analysis completed for ${result.trajectories!.length} trajectories`);
}, 45000);
/**
 * Adds generated candidates to the Pareto frontier one at a time, then checks
 * that the frontier is non-empty and no larger than the candidate set, that
 * `isDominated` is false for every ordered pair of distinct frontier members,
 * and that sampling yields a defined value.
 */
test('Pareto Frontier Optimization Workflow', async () => {
  // eslint-disable-next-line no-console
  console.log('🎯 Testing Pareto frontier optimization...');
  // Create optimization candidates
  const candidates = await testRunner.environment!.scenarios.createOptimizationCandidates();
  expect(candidates.length).toBeGreaterThan(0);
  // Add candidates to frontier
  for (const candidate of candidates) {
    await testRunner.environment!.paretoFrontier.addCandidate(candidate);
  }
  // Get frontier results
  const frontier = testRunner.environment!.paretoFrontier.getFrontier();
  expect(frontier.length).toBeGreaterThan(0);
  expect(frontier.length).toBeLessThanOrEqual(candidates.length);
  // Verify Pareto optimality: check both (i, j) and (j, i) for every pair
  for (let i = 0; i < frontier.length; i++) {
    for (let j = 0; j < frontier.length; j++) {
      if (i !== j) {
        const isDominated = testRunner.environment!.paretoFrontier.isDominated(
          frontier[i].candidate,
          frontier[j].candidate
        );
        expect(isDominated).toBe(false);
      }
    }
  }
  // Test candidate sampling
  const sampledCandidate = await testRunner.environment!.paretoFrontier.sampleCandidate();
  expect(sampledCandidate).toBeDefined();
  // eslint-disable-next-line no-console
  console.log(`✅ Pareto frontier optimization verified with ${frontier.length} optimal candidates`);
}, 30000);
});
/**
* Performance and Load Tests
*/
describe('Performance and Load Testing', () => {
/**
 * Runs the comprehensive benchmark suite and checks the overall score (> 70),
 * the per-operation SLA flags, minimum throughput per operation, and that
 * reported memory/CPU utilisation stay below 90 / 95.
 */
test('Performance Benchmarks - All Operations', async () => {
  // eslint-disable-next-line no-console
  console.log('⚡ Running comprehensive performance benchmarks...');
  const benchmarks = await testRunner.environment!.benchmarks.runComprehensiveBenchmarks();
  // Verify benchmark results meet performance thresholds
  expect(benchmarks.overallScore).toBeGreaterThan(70); // Minimum 70/100 score
  expect(benchmarks.trajectoryRecording.passesSLA).toBe(true);
  expect(benchmarks.paretoQuery.passesSLA).toBe(true);
  expect(benchmarks.memoryOperations.passesSLA).toBe(true);
  // Verify throughput requirements
  expect(benchmarks.trajectoryRecording.throughput).toBeGreaterThan(0.5); // At least 0.5 ops/sec
  expect(benchmarks.paretoQuery.throughput).toBeGreaterThan(1.0); // At least 1 ops/sec
  expect(benchmarks.memoryOperations.throughput).toBeGreaterThan(5.0); // At least 5 ops/sec
  // Verify resource utilization is reasonable
  expect(benchmarks.resourceUtilization.memory).toBeLessThan(90); // Less than 90% memory
  expect(benchmarks.resourceUtilization.cpu).toBeLessThan(95); // Less than 95% CPU
  // eslint-disable-next-line no-console
  console.log(`✅ Performance benchmarks completed - Overall Score: ${benchmarks.overallScore.toFixed(1)}/100`);
}, 120000); // 2 minute timeout for comprehensive benchmarks
/**
 * Launches five concurrent evolution scenarios and waits for all of them.
 * Every scenario must succeed, the batch must finish within twice the
 * configured `evolutionTime` threshold, and each individual scenario must
 * report an `executionTime` under 10 s.
 */
test('Concurrent Operations Stress Test', async () => {
  // eslint-disable-next-line no-console
  console.log('🔀 Testing concurrent operations under load...');
  const concurrentEvolutions = 5;
  // Start the clock BEFORE launching: the scenarios begin running as soon as
  // they are created, so timing from after the launch would under-report.
  const startTime = Date.now();
  // Launch multiple concurrent evolution scenarios
  const evolutionPromises = Array.from({ length: concurrentEvolutions }, (_, i) =>
    testRunner.environment!.scenarios.createConcurrentEvolutionScenario(i)
  );
  const results = await Promise.all(evolutionPromises);
  const executionTime = Date.now() - startTime;
  // Verify all operations completed successfully
  expect(results.length).toBe(concurrentEvolutions);
  expect(results.every(r => r.success)).toBe(true);
  // Verify concurrent execution was efficient
  expect(executionTime).toBeLessThan(testConfig.performanceThresholds!.evolutionTime * 2);
  // Verify no resource contention issues
  for (const result of results) {
    expect(result.executionTime).toBeLessThan(10000); // Each should complete within 10 seconds
  }
  // eslint-disable-next-line no-console
  console.log(`✅ Concurrent stress test completed - ${concurrentEvolutions} operations in ${executionTime}ms`);
}, 60000);
/**
 * Samples `heapUsed` before and after the high-memory-load scenario, and
 * again after an optional forced GC, then checks:
 *   - heap growth across the scenario is under 200 MB;
 *   - heap retained relative to the starting sample is under 100 MB;
 *   - when a GC was actually forced and the heap did grow, no more than half
 *     of that growth is still retained (leak heuristic).
 */
test('Memory Usage Under Load', async () => {
  // eslint-disable-next-line no-console
  console.log('💾 Testing memory usage patterns under load...');
  const initialMemory = process.memoryUsage().heapUsed;
  // Create high memory load scenario
  await testRunner.environment!.scenarios.createHighMemoryLoadScenario();
  // Sampled once the scenario has finished; this is the post-load heap size.
  const peakMemory = process.memoryUsage().heapUsed;
  const memoryIncrease = peakMemory - initialMemory;
  // Force garbage collection if available (`global.gc` only exists when Node
  // is started with --expose-gc).
  let gcForced = false;
  if (global.gc) {
    global.gc();
    gcForced = true;
  }
  const finalMemory = process.memoryUsage().heapUsed;
  const memoryRetained = finalMemory - initialMemory;
  // Verify memory usage is within acceptable limits
  expect(memoryIncrease).toBeLessThan(200 * 1024 * 1024); // Less than 200MB increase
  expect(memoryRetained).toBeLessThan(100 * 1024 * 1024); // Less than 100MB retained after GC
  // Check for memory leak indicators. The ratio is only meaningful when a GC
  // actually ran (otherwise retained ≈ increase and the check always trips)
  // and when the heap grew (a non-positive increase inverts the comparison).
  if (gcForced && memoryIncrease > 0) {
    const memoryLeakSuspected = memoryRetained > (memoryIncrease * 0.5);
    expect(memoryLeakSuspected).toBe(false);
  }
  // eslint-disable-next-line no-console
  console.log(`✅ Memory test completed - Peak: ${(peakMemory / 1024 / 1024).toFixed(1)}MB, Retained: ${(memoryRetained / 1024 / 1024).toFixed(1)}MB`);
}, 45000);
});
/**
* Error Handling and Recovery Tests
*/
describe('Error Handling and Recovery', () => {
/**
 * Simulates a failure of the 'llmAdapter' component, asks the test helpers to
 * recover it, and checks that recovery succeeds in under 5 s and that a
 * `gepa_evaluate_prompt` call succeeds afterwards.
 */
test('Component Failure Recovery', async () => {
  // eslint-disable-next-line no-console
  console.log('🛠️ Testing component failure recovery...');
  // Simulate LLM adapter failure
  await testRunner.environment!.scenarios.simulateComponentFailure('llmAdapter');
  // Attempt recovery
  const recoveryResult = await testRunner.environment!.testHelpers.attemptComponentRecovery('llmAdapter');
  expect(recoveryResult.success).toBe(true);
  expect(recoveryResult.recoveryTime).toBeLessThan(5000); // Recovery within 5 seconds
  // Verify system functionality after recovery
  const testResult = await testRunner.environment!.testHelpers.callMCPTool(
    'gepa_evaluate_prompt',
    {
      promptId: 'recovery-test-prompt',
      taskIds: ['recovery-test-task'],
      rolloutCount: 1,
    }
  );
  expect(testResult.success).toBe(true);
  // eslint-disable-next-line no-console
  console.log(`✅ Component recovery completed in ${recoveryResult.recoveryTime}ms`);
}, 30000);
/**
 * Introduces data corruption, then checks that detection reports it with at
 * least one corrupted entity and that recovery succeeds with a data integrity
 * score above 0.7.
 */
test('Data Corruption Detection and Recovery', async () => {
  // eslint-disable-next-line no-console
  console.log('🔍 Testing data corruption detection and recovery...');
  // Introduce data corruption
  await testRunner.environment!.scenarios.introduceDataCorruption();
  // Run corruption detection
  const detectionResult = await testRunner.environment!.testHelpers.runCorruptionDetection();
  expect(detectionResult.corruptionDetected).toBe(true);
  expect(detectionResult.corruptedEntities.length).toBeGreaterThan(0);
  // Attempt data recovery
  const recoveryResult = await testRunner.environment!.testHelpers.attemptDataRecovery();
  expect(recoveryResult.success).toBe(true);
  expect(recoveryResult.dataIntegrityScore).toBeGreaterThan(0.7); // At least 70% data integrity
  // eslint-disable-next-line no-console
  console.log(`✅ Data recovery completed - ${recoveryResult.recoveredEntities} entities recovered`);
}, 30000);
/**
 * Runs the resource-exhaustion scenario and checks that it completes. If the
 * scenario reports failure, it must also report that the error was handled
 * and expose an error in its metadata.
 */
test('Resource Exhaustion Handling', async () => {
  // eslint-disable-next-line no-console
  console.log('📈 Testing resource exhaustion handling...');
  // Create resource exhaustion scenario
  const exhaustionScenario = await testRunner.environment!.scenarios.createResourceExhaustionScenario();
  // Should handle gracefully without crashing
  expect(exhaustionScenario.completed).toBe(true);
  if (!exhaustionScenario.success) {
    // If operation failed due to resource exhaustion, it should be handled gracefully
    expect(exhaustionScenario.errorHandled).toBe(true);
    expect(exhaustionScenario.metadata?.error).toBeDefined();
  }
  // eslint-disable-next-line no-console
  console.log(`✅ Resource exhaustion handling completed - Handled gracefully: ${exhaustionScenario.errorHandled || exhaustionScenario.success}`);
}, 30000);
});
/**
* Memory System Integration Tests
*/
describe('Memory System Integration', () => {
/**
 * Fills memory to capacity, triggers an optimization pass, and checks that it
 * succeeds, reports a positive `spaceSaved`, and that `optimizationCount` in
 * the memory stats is higher than before the pass.
 */
test('Automated Memory Updates and Optimization', async () => {
  // eslint-disable-next-line no-console
  console.log('🧠 Testing automated memory management...');
  // Get initial memory stats
  const initialStats = await testRunner.environment!.testHelpers.getMemoryStats();
  expect(initialStats).toBeDefined();
  // Create memory load to trigger optimization
  await testRunner.environment!.scenarios.fillMemoryToCapacity();
  // Trigger memory optimization
  const optimizationResult = await testRunner.environment!.testHelpers.triggerMemoryOptimization();
  expect(optimizationResult.success).toBe(true);
  expect(optimizationResult.spaceSaved).toBeGreaterThan(0);
  // Verify memory stats after optimization
  const finalStats = await testRunner.environment!.testHelpers.getMemoryStats();
  expect(finalStats.optimizationCount).toBeGreaterThan(initialStats.optimizationCount);
  // eslint-disable-next-line no-console
  console.log(`✅ Memory optimization completed - ${(optimizationResult.spaceSaved / 1024 / 1024).toFixed(1)}MB saved`);
}, 30000);
/**
 * Saves a sample trajectory to the trajectory store and adds up to three
 * candidates to the Pareto frontier, then runs a cross-system sync and checks
 * that it succeeds and reports at least one synchronized entity.
 */
test('Cross-System Memory Synchronization', async () => {
  // eslint-disable-next-line no-console
  console.log('🔄 Testing cross-system memory synchronization...');
  // Create test data in multiple systems
  const trajectory = await testRunner.environment!.scenarios.createSampleTrajectory();
  await testRunner.environment!.trajectoryStore.save(trajectory);
  const candidates = await testRunner.environment!.scenarios.createOptimizationCandidates();
  for (const candidate of candidates.slice(0, 3)) {
    await testRunner.environment!.paretoFrontier.addCandidate(candidate);
  }
  // Perform cross-system sync
  const syncResult = await testRunner.environment!.testHelpers.performCrossSystemSync();
  expect(syncResult.success).toBe(true);
  expect(syncResult.synchronizedEntities).toBeGreaterThan(0);
  // eslint-disable-next-line no-console
  console.log(`✅ Cross-system sync completed - ${syncResult.synchronizedEntities} entities synchronized`);
}, 30000);
});
/**
* End-to-End Test Suite Summary
*/
/**
 * Runs the runner's own full test suite via `runAllTests()`, stores the
 * outcome in the module-level `testResults`, and checks the pass rate
 * (> 90%), average evolution / trajectory-recording times against the
 * configured thresholds, the leak flag, and the MCP call success rate (> 90%).
 */
test('Generate Comprehensive Test Report', async () => {
  // eslint-disable-next-line no-console
  console.log('📋 Generating comprehensive test report...');
  // Run the complete test suite
  testResults = await testRunner.runAllTests();
  // Verify overall test results (totalTests > 0 is asserted before dividing by it)
  expect(testResults.totalTests).toBeGreaterThan(0);
  expect(testResults.passedTests).toBeGreaterThan(0);
  expect(testResults.passedTests / testResults.totalTests).toBeGreaterThan(0.9); // 90% pass rate
  // Verify performance metrics
  expect(testResults.performanceMetrics.avgEvolutionTime).toBeLessThan(testConfig.performanceThresholds!.evolutionTime);
  expect(testResults.performanceMetrics.avgTrajectoryRecording).toBeLessThan(testConfig.performanceThresholds!.trajectoryRecording);
  // Verify memory usage
  expect(testResults.memoryUsage.leakDetected).toBe(false);
  // Generate execution report
  const executionReport = testRunner.environment!.testHelpers.generateExecutionReport();
  expect(executionReport.successRate).toBeGreaterThan(0.9); // 90% success rate for MCP calls
  // eslint-disable-next-line no-console
  console.log(`✅ Test report generated - ${testResults.passedTests}/${testResults.totalTests} tests passed`);
  // eslint-disable-next-line no-console
  console.log(`📈 Performance Score: ${testResults.performanceMetrics.avgEvolutionTime}ms avg evolution time`);
  // eslint-disable-next-line no-console
  console.log(`💾 Memory Peak: ${(testResults.memoryUsage.peakUsage / 1024 / 1024).toFixed(1)}MB`);
}, 300000); // 5 minute timeout for complete test suite
});
/**
* Performance regression detection (optional advanced test)
*/
describe('Performance Regression Detection', () => {
  /**
   * Runs the comprehensive benchmarks and reports the current overall score
   * next to a hard-coded mock baseline. The baseline comparison is
   * informational only: the test fails solely if the benchmarks throw or do
   * not produce a non-negative numeric score.
   */
  test('Detect Performance Regressions', async () => {
    // eslint-disable-next-line no-console
    console.log('📉 Running performance regression detection...');
    // This test would compare current performance against baseline
    // For demonstration, we'll create mock baseline data
    const mockBaseline = {
      trajectoryRecording: { averageTime: 800, throughput: 1.2 },
      paretoQuery: { averageTime: 300, throughput: 3.3 },
      memoryOperations: { averageTime: 50, throughput: 20 },
      evolutionCycle: { averageTime: 25000, throughput: 0.04 },
      concurrentOperations: { averageTime: 2000, throughput: 0.5 },
      overallScore: 85,
      resourceUtilization: { cpu: 45, memory: 60, io: 30 },
    };
    // Run current benchmarks
    const currentBenchmarks = await testRunner.environment!.benchmarks.runComprehensiveBenchmarks();
    // Sanity check so the test asserts something: the score must be a real,
    // non-negative number (NaN and undefined both fail this matcher).
    expect(currentBenchmarks.overallScore).toBeGreaterThanOrEqual(0);
    // In a real implementation, you would:
    // const regressionAnalysis = testRunner.environment!.benchmarks.detectRegressions(mockBaseline);
    // expect(regressionAnalysis.overallAssessment).not.toBe('degraded');
    // Until then, surface the gap to the mock baseline without failing on it.
    const scoreDelta = currentBenchmarks.overallScore - mockBaseline.overallScore;
    // eslint-disable-next-line no-console
    console.log(`📊 Score delta vs. mock baseline: ${scoreDelta.toFixed(1)}`);
    // eslint-disable-next-line no-console
    console.log(`✅ Performance regression detection completed - Current score: ${currentBenchmarks.overallScore.toFixed(1)}`);
  }, 120000);
});