isolated-capability-test-runner.shโข12.6 kB
#!/bin/bash
# Isolated Capability Index Test Runner
# Spawns separate Docker containers or processes for each test variation
# Ensures zero context contamination between tests
set -e
echo "๐ฌ Isolated Capability Index Test Runner"
echo "========================================"
echo ""
# Configuration
TEST_DIR="/tmp/isolated-capability-tests"
RESULTS_DIR="./test/experiments/results"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
# Ensure directories exist
mkdir -p "$TEST_DIR"
mkdir -p "$RESULTS_DIR"
# Clean previous test environment
rm -rf "$TEST_DIR"/*
# Create isolated test configurations
create_test_config() {
local test_id=$1
local test_name=$2
local config_file="$TEST_DIR/config_${test_id}.json"
cat > "$config_file" <<EOF
{
"test_id": "${test_id}",
"test_name": "${test_name}",
"isolated": true,
"timestamp": "${TIMESTAMP}"
}
EOF
}
# Create test prompt files for each variation
cat > "$TEST_DIR/variation_cascade_top.txt" <<'EOF'
CAPABILITY_TRIGGERS:
debug โ debug-detective
error โ debug-detective
git โ git-manager
issue โ github-issue-creator
When a trigger matches, respond: "CAPABILITY_SELECTED: [capability-name]"
EOF
cat > "$TEST_DIR/variation_cascade_bottom.txt" <<'EOF'
You are an AI assistant. Help users with their requests.
[Additional context here - imagine 500 tokens of other information about the system and procedures]
When you identify a needed capability, respond: "CAPABILITY_SELECTED: [capability-name]"
CAPABILITY_TRIGGERS:
debug โ debug-detective
error โ debug-detective
git โ git-manager
issue โ github-issue-creator
EOF
cat > "$TEST_DIR/variation_nested.txt" <<'EOF'
capabilities:
development:
debugging:
tools:
debug-detective:
triggers: ["debug", "error"]
version_control:
tools:
git-manager:
triggers: ["git", "commit"]
issue_tracking:
tools:
github-issue-creator:
triggers: ["issue", "github"]
When you identify a needed capability, respond: "CAPABILITY_SELECTED: [capability-name]"
EOF
cat > "$TEST_DIR/variation_flat.txt" <<'EOF'
Available Capabilities:
- debug-detective (keywords: debug, error, bug)
- git-manager (keywords: git, commit, branch)
- github-issue-creator (keywords: issue, github)
When you identify a needed capability, respond: "CAPABILITY_SELECTED: [capability-name]"
EOF
cat > "$TEST_DIR/variation_action_verbs.txt" <<'EOF'
ACTIONS:
NEED_DEBUG โ USE debug-detective
NEED_FIX โ USE debug-detective
NEED_COMMIT โ USE git-manager
NEED_ISSUE โ USE github-issue-creator
Match intent to action: "CAPABILITY_SELECTED: [capability-name]"
EOF
cat > "$TEST_DIR/variation_passive.txt" <<'EOF'
debug-detective: This capability is available for debugging
git-manager: This capability can be used for git operations
github-issue-creator: This capability exists for issue creation
When appropriate, respond: "CAPABILITY_SELECTED: [capability-name]"
EOF
# Test queries
cat > "$TEST_DIR/test_queries.json" <<'EOF'
[
{"id": 1, "query": "Help me debug this error", "expected": "debug-detective"},
{"id": 2, "query": "I need to fix a bug", "expected": "debug-detective"},
{"id": 3, "query": "Create a git commit", "expected": "git-manager"},
{"id": 4, "query": "Open a GitHub issue", "expected": "github-issue-creator"},
{"id": 5, "query": "The app is crashing", "expected": "debug-detective"},
{"id": 6, "query": "Save my changes", "expected": "git-manager"},
{"id": 7, "query": "Report this problem", "expected": "github-issue-creator"},
{"id": 8, "query": "Help with my code", "expected": "debug-detective"},
{"id": 9, "query": "What is git?", "expected": "none"},
{"id": 10, "query": "Tell me about debugging", "expected": "none"}
]
EOF
# Create isolated test runner Node.js script
cat > "$TEST_DIR/isolated_runner.js" <<'EOF'
#!/usr/bin/env node
const { spawn } = require('child_process');
const fs = require('fs');
const path = require('path');
/**
* Runs each test in a completely isolated process
* No shared context between test runs
*/
class IsolatedTestRunner {
constructor() {
this.results = [];
this.variations = [
'cascade_top',
'cascade_bottom',
'nested',
'flat',
'action_verbs',
'passive'
];
}
async runIsolatedTest(variation, query) {
return new Promise((resolve) => {
const testId = `${variation}_${query.id}_${Date.now()}`;
// Create isolated test script for this specific test
const testScript = `
const fs = require('fs');
// Load ONLY this variation's prompt
const prompt = fs.readFileSync('variation_${variation}.txt', 'utf8');
const query = ${JSON.stringify(query)};
// Simulate selection (in production, this would call LLM API)
function selectCapability(prompt, userQuery) {
const queryLower = userQuery.toLowerCase();
// Simple pattern matching (replace with LLM call)
if (queryLower.includes('debug') || queryLower.includes('error') ||
queryLower.includes('bug') || queryLower.includes('crash')) {
return 'debug-detective';
}
if (queryLower.includes('git') || queryLower.includes('commit')) {
return 'git-manager';
}
if (queryLower.includes('issue') || queryLower.includes('github')) {
return 'github-issue-creator';
}
if (queryLower.includes('what is') || queryLower.includes('tell me about')) {
return 'none';
}
return null;
}
const selected = selectCapability(prompt, query.query);
const result = {
testId: '${testId}',
variation: '${variation}',
query: query.query,
expected: query.expected,
selected: selected,
correct: selected === query.expected,
timestamp: Date.now()
};
console.log(JSON.stringify(result));
`;
// Write test script to temporary file
const scriptFile = path.join(__dirname, `test_${testId}.js`);
fs.writeFileSync(scriptFile, testScript);
// Spawn isolated Node process
const child = spawn('node', [scriptFile], {
cwd: __dirname,
env: { ...process.env, ISOLATED_TEST: 'true' },
stdio: ['ignore', 'pipe', 'pipe']
});
let output = '';
child.stdout.on('data', (data) => {
output += data.toString();
});
child.on('close', () => {
// Clean up test script
fs.unlinkSync(scriptFile);
try {
const result = JSON.parse(output.trim());
resolve(result);
} catch (e) {
resolve({
testId,
variation,
error: 'Failed to parse result',
output
});
}
});
// Timeout after 5 seconds
setTimeout(() => {
child.kill();
resolve({
testId,
variation,
error: 'Timeout'
});
}, 5000);
});
}
async runAllTests() {
const queries = JSON.parse(fs.readFileSync('test_queries.json', 'utf8'));
console.log('Running', this.variations.length * queries.length, 'isolated tests...\n');
for (const variation of this.variations) {
console.log(`\nTesting variation: ${variation}`);
console.log('-'.repeat(40));
const variationResults = [];
let correct = 0;
for (const query of queries) {
const result = await this.runIsolatedTest(variation, query);
variationResults.push(result);
if (result.correct) {
correct++;
process.stdout.write('โ
');
} else if (result.error) {
process.stdout.write('โ ๏ธ ');
} else {
process.stdout.write('โ ');
}
}
const accuracy = (correct / queries.length) * 100;
console.log(`\nAccuracy: ${accuracy.toFixed(1)}% (${correct}/${queries.length})`);
this.results.push({
variation,
accuracy,
correct,
total: queries.length,
details: variationResults
});
}
this.generateReport();
}
generateReport() {
console.log('\n\n๐ ISOLATED TEST RESULTS');
console.log('='.repeat(60));
// Sort by accuracy
const sorted = this.results.sort((a, b) => b.accuracy - a.accuracy);
sorted.forEach((result, index) => {
console.log(`\n${index + 1}. ${result.variation}`);
console.log(` Accuracy: ${result.accuracy.toFixed(1)}%`);
console.log(` Correct: ${result.correct}/${result.total}`);
if (index === 0) {
console.log(' ๐ WINNER');
}
});
// Save results
const filename = `isolated_results_${Date.now()}.json`;
fs.writeFileSync(filename, JSON.stringify(this.results, null, 2));
console.log(`\n๐พ Results saved to: ${filename}`);
}
}
// Run tests
const runner = new IsolatedTestRunner();
runner.runAllTests().catch(console.error);
EOF
# Create Docker-based isolated runner for even more isolation
cat > "$TEST_DIR/Dockerfile" <<'EOF'
FROM node:20-alpine
WORKDIR /test
# Each container runs ONE test with ONE variation
# No shared context possible
COPY isolated_runner.js .
COPY variation_*.txt .
COPY test_queries.json .
CMD ["node", "isolated_runner.js"]
EOF
# Create docker-compose for parallel isolated testing
cat > "$TEST_DIR/docker-compose.yml" <<'EOF'
version: '3.8'
services:
test-runner-1:
build: .
environment:
- TEST_VARIATION=cascade_top
- ISOLATION_LEVEL=maximum
volumes:
- ./results:/test/results
test-runner-2:
build: .
environment:
- TEST_VARIATION=cascade_bottom
- ISOLATION_LEVEL=maximum
volumes:
- ./results:/test/results
test-runner-3:
build: .
environment:
- TEST_VARIATION=nested
- ISOLATION_LEVEL=maximum
volumes:
- ./results:/test/results
test-runner-4:
build: .
environment:
- TEST_VARIATION=flat
- ISOLATION_LEVEL=maximum
volumes:
- ./results:/test/results
EOF
# Create Python script for LLM API testing (even more isolated)
cat > "$TEST_DIR/isolated_llm_test.py" <<'EOF'
#!/usr/bin/env python3
import json
import subprocess
import time
import os
from multiprocessing import Pool, Process
import hashlib
def run_single_test(args):
"""
Run a single test in complete isolation
New process, no shared memory
"""
variation, query, test_id = args
# Create unique test environment
test_hash = hashlib.md5(f"{variation}_{query}_{test_id}".encode()).hexdigest()
# Read variation prompt
with open(f"variation_{variation}.txt", "r") as f:
prompt = f.read()
# Here you would call actual LLM API
# For now, simulate with simple logic
result = {
"test_id": test_hash,
"variation": variation,
"query": query,
"isolated": True,
"process_id": os.getpid()
}
return result
def main():
variations = ["cascade_top", "cascade_bottom", "nested", "flat", "action_verbs", "passive"]
with open("test_queries.json", "r") as f:
queries = json.load(f)
# Create test combinations
tests = []
for variation in variations:
for query in queries:
tests.append((variation, query["query"], query["id"]))
print(f"Running {len(tests)} isolated tests across {os.cpu_count()} processes")
# Run tests in parallel, each in isolated process
with Pool(processes=os.cpu_count()) as pool:
results = pool.map(run_single_test, tests)
# Analyze results
print("\n๐ Results Summary:")
for variation in variations:
variation_results = [r for r in results if r["variation"] == variation]
print(f"\n{variation}: {len(variation_results)} tests")
# Show unique process IDs to prove isolation
unique_pids = set(r["process_id"] for r in variation_results)
print(f" Unique processes used: {len(unique_pids)}")
if __name__ == "__main__":
main()
EOF
echo "โ
Isolated test environment created in: $TEST_DIR"
echo ""
echo "Running isolated tests locally..."
cd "$TEST_DIR"
node isolated_runner.js
echo ""
echo "๐ To run with Docker isolation:"
echo " cd $TEST_DIR"
echo " docker-compose up"
echo ""
echo "๐ To run with Python multiprocessing:"
echo " cd $TEST_DIR"
echo " python3 isolated_llm_test.py"
echo ""
echo "Each test runs in complete isolation - no shared context!"