Thoughtbox

Thoughtbox
benchmarks

tiered-evaluator.ts•10.6 KiB

/**
 * Tiered Evaluation Pipeline
 * SPEC: SIL-004
 *
 * Implements tiered evaluation that runs cheap gates first, terminating
 * early on failure to minimize cost while ensuring quality.
 *
 * Tier order:
 * 1. Smoke test ($0.10) - Quick sanity check
 * 2. Regression ($1.00) - Full test suite
 * 3. Real-world ($10.00) - Actual benchmarks
 *
 * Early termination saves 80-90% of evaluation cost.
 */

import {
  loadBenchmarkConfig,
  type BenchmarkConfig,
  type TierConfig,
} from "./config-loader.js";
import { improvementTracker } from "../src/observatory/improvement-tracker.js";

// =============================================================================
// Types
// =============================================================================

/**
 * A code modification to be evaluated.
 *
 * The evaluator passes this object through to each tier executor unchanged.
 */
export interface CodeModification {
  id: string;
  type: string;
  files: string[];
  diff: string;
}

/**
 * Result of executing a single tier.
 *
 * Executors fill in every field, but `TieredEvaluator.evaluate` recomputes
 * `passed` (score compared against the tier's `required_pass_rate`) and
 * `duration_ms` (wall-clock time around the executor call) in the results it
 * returns, so the values an executor supplies for those two fields are
 * advisory only.
 */
export interface TierResult {
  /** Display name of the tier. */
  tier: string;
  /** Tier identifier. */
  tierId: string;
  /** Score that is compared against the tier's required pass rate. */
  score: number;
  passed: boolean;
  /** Cost attributed to running this tier; summed into `totalCost`. */
  cost: number;
  duration_ms: number;
  /** Free-form diagnostic payload (test counts, notes, error text). */
  details?: Record<string, unknown>;
}

/**
 * Overall evaluation result.
 */
export interface EvaluationResult {
  /** True when no tier failed. */
  passed: boolean;
  /** Ids of the tiers that passed, in execution order. */
  passedTiers: string[];
  /** Id of the tier that stopped the run, or `null` when nothing failed. */
  failedAt: string | null;
  /** Results of every tier that was actually executed. */
  tierResults: TierResult[];
  /** Sum of `cost` over the executed tiers. */
  totalCost: number;
  /** Sum of the measured durations of the executed tiers. */
  totalDuration_ms: number;
  /** Human-readable failure summary; absent when the evaluation passed. */
  reason?: string;
}

/**
 * Interface for tier executors.
 *
 * The evaluator looks executors up by tier id, and `registerExecutor` stores
 * them under `name`, so `name` has to equal the id of the tier it serves.
 */
export interface TierExecutor {
  name: string;
  execute(modification: CodeModification): Promise<TierResult>;
}

/**
 * Configuration for tiered evaluator.
 */
export interface TieredEvaluatorConfig {
  /** Path to benchmark config YAML */
  configPath?: string;
  /** Enable observatory tracking (treated as enabled when omitted) */
  trackingEnabled?: boolean;
}

// =============================================================================
// Default Tier Costs
// =============================================================================

/**
 * Cost reported per tier id, matching the dollar figures in the file header.
 *
 * Used by the default executors as their reported cost and by the evaluator
 * as the cost of a tier whose executor threw.
 */
const DEFAULT_TIER_COSTS: Record<string, number> = {
  "smoke-test": 0.1,
  regression: 1.0,
  "real-world": 10.0,
};
// =============================================================================
// TieredEvaluator
// =============================================================================

/**
 * Tiered evaluation pipeline for code modifications.
 *
 * Runs evaluation tiers in the order given by the benchmark configuration,
 * stopping on the first failure to minimize cost. Each executed tier is
 * reported to the ImprovementTracker when tracking is enabled.
 */
export class TieredEvaluator {
  private config: BenchmarkConfig;
  private executors: Map<string, TierExecutor> = new Map();
  private trackingEnabled: boolean;

  /**
   * @param options - Optional config path and tracking switch; tracking is
   *   enabled unless `trackingEnabled` is explicitly `false`.
   */
  constructor(options: TieredEvaluatorConfig = {}) {
    this.config = loadBenchmarkConfig(options.configPath);
    this.trackingEnabled = options.trackingEnabled ?? true;
    this.registerDefaultExecutors();
  }

  /**
   * Evaluate a modification through all tiers.
   * Stops on first required tier failure.
   *
   * A tier passes when its score is at least the tier's `required_pass_rate`;
   * the `passed` flag supplied by the executor is ignored. An executor that
   * throws is recorded as a failed tier with score 0. Tiers without a
   * registered executor are skipped with a warning and do not count as
   * failures.
   *
   * @param modification - The code modification to evaluate
   * @returns Evaluation result with the scores of every executed tier
   */
  async evaluate(modification: CodeModification): Promise<EvaluationResult> {
    const tierResults: TierResult[] = [];
    const passedTiers: string[] = [];
    let totalCost = 0;
    let totalDuration = 0;
    let failedAt: string | null = null;
    let failure: TierResult | null = null;

    for (const tierConfig of this.config.tiers) {
      const executor = this.executors.get(tierConfig.id);
      if (!executor) {
        console.warn(`No executor for tier: ${tierConfig.id}`);
        continue;
      }

      const startTime = Date.now();
      let reported: TierResult;
      try {
        reported = await executor.execute(modification);
      } catch (err) {
        // Executor threw - treat as failure
        reported = {
          tier: tierConfig.name,
          tierId: tierConfig.id,
          score: 0,
          passed: false,
          cost: DEFAULT_TIER_COSTS[tierConfig.id] ?? 0,
          duration_ms: Date.now() - startTime,
          details: {
            error: describeError(err),
          },
        };
      }

      const duration = Date.now() - startTime;
      const passed = reported.score >= tierConfig.required_pass_rate;

      // Copy instead of mutating: the executor may hand back an object it
      // still owns (e.g. a shared fixture), which must not be rewritten here.
      const result: TierResult = {
        ...reported,
        duration_ms: duration,
        passed,
      };

      tierResults.push(result);
      totalCost += result.cost;
      totalDuration += duration;

      // Track in Observatory
      if (this.trackingEnabled) {
        improvementTracker.trackEvaluation(
          {
            tier: tierConfig.id,
            tierName: tierConfig.name,
            score: result.score,
            threshold: tierConfig.required_pass_rate,
            passed: result.passed,
            details: result.details,
          },
          result.cost,
          result.passed
        );
      }

      if (passed) {
        passedTiers.push(tierConfig.id);
      } else {
        // Early termination on failure (all tiers are required by default)
        failedAt = tierConfig.id;
        failure = result;
        break;
      }
    }

    return {
      passed: failure === null,
      passedTiers,
      failedAt,
      tierResults,
      totalCost,
      totalDuration_ms: totalDuration,
      // Use the failing result itself rather than searching by tierId, which
      // misses when an executor reports a tierId other than the configured id.
      reason: failure
        ? `Failed ${failedAt} with score ${failure.score?.toFixed(2)}`
        : undefined,
    };
  }

  /**
   * Register a custom tier executor.
   *
   * The executor is stored under `executor.name`, replacing any executor
   * already registered under that key. `evaluate` looks executors up by tier
   * id, so the name must match the id of the tier it should handle.
   *
   * @param executor - The executor to register
   */
  registerExecutor(executor: TierExecutor): void {
    this.executors.set(executor.name, executor);
  }

  /**
   * Get the current benchmark configuration.
   */
  getConfig(): BenchmarkConfig {
    return this.config;
  }

  /**
   * Get all registered tier IDs.
   */
  getRegisteredTiers(): string[] {
    return Array.from(this.executors.keys());
  }

  /**
   * Register default tier executors.
   *
   * These provide basic implementations that can be overridden
   * with custom executors for specific use cases.
   *
   * NOTE(review): the smoke and regression executors use `execSync`, which
   * blocks the event loop for the duration of the test run.
   */
  private registerDefaultExecutors(): void {
    // Smoke test: Quick sanity check
    this.executors.set("smoke-test", {
      name: "smoke-test",
      execute: async (_modification: CodeModification): Promise<TierResult> => {
        // Default implementation runs behavioral tests
        try {
          const { execSync } = await import("child_process");
          execSync("npm run test:behavioral -- --reporter=json 2>/dev/null", {
            encoding: "utf-8",
            timeout: 5 * 60 * 1000, // 5 minutes
            stdio: "pipe",
          });
          return {
            tier: "Smoke Test",
            tierId: "smoke-test",
            score: 1.0,
            passed: true,
            cost: DEFAULT_TIER_COSTS["smoke-test"],
            duration_ms: 0,
          };
        } catch (err) {
          // Non-zero exit, timeout, or spawn failure: all-or-nothing tier.
          return {
            tier: "Smoke Test",
            tierId: "smoke-test",
            score: 0,
            passed: false,
            cost: DEFAULT_TIER_COSTS["smoke-test"],
            duration_ms: 0,
            details: { error: describeError(err) },
          };
        }
      },
    });

    // Regression: Full test suite
    this.executors.set("regression", {
      name: "regression",
      execute: async (_modification: CodeModification): Promise<TierResult> => {
        try {
          const { execSync } = await import("child_process");
          const output = execSync("npm test -- --reporter=json 2>/dev/null", {
            encoding: "utf-8",
            timeout: 30 * 60 * 1000, // 30 minutes
            stdio: "pipe",
          });

          // Command succeeded: use the reported pass rate when the output
          // can be read, otherwise assume a full pass.
          const summary = parseTestSummary(output);
          const score = summary ? summary.passed / summary.total : 1.0;
          const details: Record<string, unknown> = summary
            ? { ...summary }
            : { note: "Could not parse test output" };

          return {
            tier: "Regression Tests",
            tierId: "regression",
            score,
            passed: score >= 0.95,
            cost: DEFAULT_TIER_COSTS["regression"],
            duration_ms: 0,
            details,
          };
        } catch (err) {
          // A failing test makes the command exit non-zero, which execSync
          // surfaces as an exception carrying the captured stdout. Recover
          // the real pass rate from it so the tier threshold is meaningful.
          const summary = parseTestSummary(capturedStdout(err) ?? "");

          // Only trust the ratio when the report itself shows failing tests;
          // a non-zero exit with an all-green (or unreadable) report means
          // something else broke, so the tier scores 0.
          const score =
            summary && summary.passed < summary.total
              ? summary.passed / summary.total
              : 0;

          return {
            tier: "Regression Tests",
            tierId: "regression",
            score,
            passed: score >= 0.95,
            cost: DEFAULT_TIER_COSTS["regression"],
            duration_ms: 0,
            details: { ...(summary ?? {}), error: describeError(err) },
          };
        }
      },
    });

    // Real-world: Placeholder for SWE-bench integration
    this.executors.set("real-world", {
      name: "real-world",
      execute: async (_modification: CodeModification): Promise<TierResult> => {
        // TODO: Integrate with SWE-bench anchor points from SPEC-SIL-003
        console.log(
          "Real-world tier: Using placeholder (SWE-bench not yet integrated)"
        );
        return {
          tier: "Real-World Tests",
          tierId: "real-world",
          score: 0.8, // Placeholder score
          passed: true,
          cost: DEFAULT_TIER_COSTS["real-world"],
          duration_ms: 0,
          details: { note: "SWE-bench integration pending" },
        };
      },
    });
  }
}

// =============================================================================
// Test Output Helpers
// =============================================================================

/** Test counts read from a JSON test report. */
interface TestSummary {
  total: number;
  passed: number;
  failed: number | undefined;
}

/** Render a caught value as a message string. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}

/** Return the `stdout` string attached to a failed child-process error, if any. */
function capturedStdout(err: unknown): string | undefined {
  if (typeof err !== "object" || err === null || !("stdout" in err)) {
    return undefined;
  }
  const stdout = (err as { stdout?: unknown }).stdout;
  return typeof stdout === "string" ? stdout : undefined;
}

/**
 * Parse `raw` as JSON, tolerating non-JSON text around a single JSON object
 * (for example the script banner npm prints before the reporter output).
 */
function extractJson(raw: string): unknown {
  try {
    return JSON.parse(raw);
  } catch {
    // Fall through and retry on the outermost {...} span.
  }
  const start = raw.indexOf("{");
  const end = raw.lastIndexOf("}");
  if (start === -1 || end <= start) {
    return undefined;
  }
  try {
    return JSON.parse(raw.slice(start, end + 1));
  } catch {
    return undefined;
  }
}

/**
 * Read `numTotalTests` / `numPassedTests` / `numFailedTests` from test
 * reporter output.
 *
 * @param raw - Captured stdout of the test command
 * @returns The counts, or `undefined` when the output holds no JSON object
 *   with a positive numeric total and a numeric passed count
 */
function parseTestSummary(raw: string): TestSummary | undefined {
  const parsed = extractJson(raw);
  if (typeof parsed !== "object" || parsed === null) {
    return undefined;
  }
  const report = parsed as Record<string, unknown>;
  const total = report.numTotalTests;
  const passed = report.numPassedTests;
  const failed = report.numFailedTests;
  if (typeof total !== "number" || typeof passed !== "number" || !(total > 0)) {
    return undefined;
  }
  return {
    total,
    passed,
    failed: typeof failed === "number" ? failed : undefined,
  };
}

// =============================================================================
// Mock Executor for Testing
// =============================================================================

/**
 * Create a mock executor for testing purposes.
 *
 * The executor ignores its input and always resolves with the same score,
 * a fixed `duration_ms` of 100, and `passed` set to `score >= 0.8`.
 *
 * @param tierId - The tier ID, used as executor name, `tier` and `tierId`
 * @param score - The score to return
 * @param cost - The cost to report; defaults to the tier's default cost, or 0
 *   for an unknown tier id
 */
export function createMockExecutor(
  tierId: string,
  score: number,
  cost: number = DEFAULT_TIER_COSTS[tierId] ?? 0
): TierExecutor {
  return {
    name: tierId,
    execute: async (): Promise<TierResult> => ({
      tier: tierId,
      tierId,
      score,
      passed: score >= 0.8,
      cost,
      duration_ms: 100,
    }),
  };
}

// =============================================================================
// Singleton
// =============================================================================

let evaluatorInstance: TieredEvaluator | null = null;

/**
 * Get the singleton TieredEvaluator instance.
 *
 * @param options - Configuration options (only used on first call)
 */
export function getTieredEvaluator(
  options?: TieredEvaluatorConfig
): TieredEvaluator {
  if (!evaluatorInstance) {
    evaluatorInstance = new TieredEvaluator(options);
  }
  return evaluatorInstance;
}

/**
 * Reset the singleton instance (for testing).
 */
export function resetTieredEvaluator(): void {
  evaluatorInstance = null;
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/glassBead-tc/Thoughtbox'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

tiered-evaluator.ts•10.6 KiB