#!/usr/bin/env npx tsx
/**
* CLI for SIL Benchmark Harness
* SPEC-SIL-100: Benchmark Harness
*
* Usage:
* npx tsx dgm-specs/harness/cli.ts run # Run benchmarks, compare to baseline
* npx tsx dgm-specs/harness/cli.ts baseline # Establish new baseline
* npx tsx dgm-specs/harness/cli.ts report # Generate comparison report
* npx tsx dgm-specs/harness/cli.ts --help # Show help
*/
import { config } from 'dotenv';
config();
import {
runAllBenchmarks,
formatBenchmarkRun,
TEST_CONFIGS,
} from './benchmark-runner.js';
import {
loadBaseline,
saveBaseline,
saveToHistory,
compareToBaseline,
formatComparison,
} from './baseline.js';
const HELP_TEXT = `
SIL Benchmark Harness CLI
SPEC-SIL-100: Benchmark Harness
Usage:
npx tsx dgm-specs/harness/cli.ts <command> [options]
Commands:
run Run all benchmarks, compare to baseline if exists
baseline Run benchmarks and save as new baseline
report Show comparison between current run and baseline
list List available test configurations
help Show this help message
Options:
--json Output results as JSON
--url <url> MCP server URL (default: http://localhost:1731/mcp)
Examples:
# Run benchmarks and compare to baseline
npx tsx dgm-specs/harness/cli.ts run
# Establish new baseline
npx tsx dgm-specs/harness/cli.ts baseline
# Run with custom MCP URL
npx tsx dgm-specs/harness/cli.ts run --url http://localhost:4000/mcp
# Output as JSON for CI integration
npx tsx dgm-specs/harness/cli.ts run --json
`;
async function main() {
const args = process.argv.slice(2);
const command = args[0];
const jsonOutput = args.includes('--json');
const urlIndex = args.indexOf('--url');
const mcpUrl = urlIndex !== -1 ? args[urlIndex + 1] : 'http://localhost:1731/mcp';
// Validate API key early
if (!process.env.ANTHROPIC_API_KEY && command !== 'list' && command !== 'help' && command !== '--help' && command !== '-h') {
console.error(`
Error: ANTHROPIC_API_KEY environment variable is not set.
The benchmark harness uses the Claude Agent SDK which requires an Anthropic API key.
To fix this:
1. Create a .env file with: ANTHROPIC_API_KEY=your-key
2. Or set the environment variable directly
`);
process.exit(1);
}
switch (command) {
case 'run': {
const currentRun = await runAllBenchmarks(TEST_CONFIGS, mcpUrl);
const baseline = loadBaseline();
if (jsonOutput) {
if (baseline) {
const comparison = compareToBaseline(currentRun, baseline);
console.log(JSON.stringify({ run: currentRun, comparison }, null, 2));
process.exit(comparison.verdict === 'PASS' ? 0 : 1);
} else {
console.log(JSON.stringify({ run: currentRun, comparison: null }, null, 2));
}
} else {
console.log(formatBenchmarkRun(currentRun));
if (baseline) {
const comparison = compareToBaseline(currentRun, baseline);
console.log(formatComparison(comparison));
process.exit(comparison.verdict === 'PASS' ? 0 : 1);
} else {
console.log('\nNo baseline found. Run `npx tsx dgm-specs/harness/cli.ts baseline` to establish one.\n');
}
}
// Save to history
saveToHistory(currentRun);
break;
}
case 'baseline': {
console.log('Establishing new baseline...\n');
const run = await runAllBenchmarks(TEST_CONFIGS, mcpUrl);
if (jsonOutput) {
console.log(JSON.stringify(run, null, 2));
} else {
console.log(formatBenchmarkRun(run));
}
saveBaseline(run);
saveToHistory(run);
if (!jsonOutput) {
console.log(`\n✓ Baseline established: ${run.runId}`);
console.log(' Future runs will compare against this baseline.\n');
}
break;
}
case 'report': {
const baseline = loadBaseline();
if (!baseline) {
console.error('No baseline found. Run `baseline` command first.');
process.exit(1);
}
const currentRun = await runAllBenchmarks(TEST_CONFIGS, mcpUrl);
const comparison = compareToBaseline(currentRun, baseline);
if (jsonOutput) {
console.log(JSON.stringify({ baseline, current: currentRun, comparison }, null, 2));
} else {
console.log(formatBenchmarkRun(currentRun));
console.log(formatComparison(comparison));
}
saveToHistory(currentRun);
process.exit(comparison.verdict === 'PASS' ? 0 : 1);
}
case 'list': {
console.log('\nAvailable Test Configurations:\n');
for (const config of TEST_CONFIGS) {
console.log(` ${config.id}`);
console.log(` Name: ${config.name}`);
console.log(` Toolhost: ${config.toolhost}`);
console.log(` Steps: ${config.steps.length}`);
console.log('');
}
break;
}
case 'help':
case '--help':
case '-h':
case undefined: {
console.log(HELP_TEXT);
break;
}
default: {
console.error(`Unknown command: ${command}`);
console.log(HELP_TEXT);
process.exit(1);
}
}
}
main().catch(error => {
console.error('Benchmark error:', error);
process.exit(1);
});