Thoughtbox

Thoughtbox
benchmarks

config-loader.ts•10 KiB

/**
 * Benchmark Suite Configuration Loader
 * SPEC: SIL-002
 *
 * Loads and validates the benchmark suite configuration from YAML.
 * Uses Zod for schema validation to ensure configuration integrity.
 */

import { z } from "zod";
import { readFileSync, existsSync } from "fs";
import { parse as parseYaml } from "yaml";
import { resolve, dirname } from "path";
import { fileURLToPath } from "url";

// =============================================================================
// Schema Definitions
// =============================================================================

/**
 * Test step within a test case
 */
export const TestStepSchema = z.object({
  operation: z.string().min(1),
  args: z.record(z.string(), z.unknown()).optional(),
  expectedBehavior: z.string().min(1),
});

export type TestStep = z.infer<typeof TestStepSchema>;

/**
 * Individual test case configuration
 */
export const TestCaseSchema = z.object({
  id: z.string().min(1),
  toolhost: z.enum(["thoughtbox", "notebook", "mental_models", "memory", "init"]),
  name: z.string().min(1),
  description: z.string().min(1),
  source: z.enum(["github", "synthetic"]).optional(),
  steps: z.array(TestStepSchema).min(1),
});

export type TestCase = z.infer<typeof TestCaseSchema>;

/**
 * Evaluation tier configuration
 */
export const TierConfigSchema = z.object({
  id: z.string().min(1),
  name: z.string().min(1),
  description: z.string().min(1),
  timeout_seconds: z.number().int().positive(),
  required_pass_rate: z.number().min(0).max(1),
  tests: z.array(TestCaseSchema).min(1),
});

export type TierConfig = z.infer<typeof TierConfigSchema>;

/**
 * Anchor point definition for sampling
 */
export const AnchorPointSchema = z.object({
  test_id: z.string().min(1),
  correlation: z.number().min(0).max(1),
  description: z.string().min(1),
});

export type AnchorPoint = z.infer<typeof AnchorPointSchema>;

/**
 * Sampling rule triggered by anchor point results
 */
export const SamplingRuleSchema = z.object({
  trigger: z.object({
    anchor_id: z.string().min(1),
    result: z.enum(["pass", "fail"]),
  }),
  skip_probability: z.number().min(0).max(1),
  affected_tests: z.array(z.string()).min(1),
});

export type SamplingRule = z.infer<typeof SamplingRuleSchema>;

/**
 * Anchor points configuration for cost reduction
 */
export const AnchorPointsConfigSchema = z.object({
  enabled: z.boolean(),
  confidence_threshold: z.number().min(0).max(1),
  anchors: z.array(AnchorPointSchema),
  sampling_rules: z.array(SamplingRuleSchema),
});

export type AnchorPointsConfig = z.infer<typeof AnchorPointsConfigSchema>;

/**
 * Contamination detection settings
 */
export const ContaminationDetectionSchema = z.object({
  enabled: z.boolean(),
  baseline_hash_check: z.boolean(),
});

export type ContaminationDetection = z.infer<typeof ContaminationDetectionSchema>;

/**
 * Variance injection settings
 */
export const VarianceInjectionSchema = z.object({
  enabled: z.boolean(),
  permute_test_order: z.boolean(),
  input_fuzzing: z.boolean(),
  fuzzing_seed_rotation: z.enum(["daily", "weekly", "monthly"]),
});

export type VarianceInjection = z.infer<typeof VarianceInjectionSchema>;

/**
 * Anomaly detection settings
 */
export const AnomalyDetectionSchema = z.object({
  enabled: z.boolean(),
  benchmark_real_world_gap_threshold: z.number().min(0).max(1),
  perfect_score_investigation: z.boolean(),
});

export type AnomalyDetection = z.infer<typeof AnomalyDetectionSchema>;

/**
 * Proctoring configuration for gaming prevention
 */
export const ProctoringConfigSchema = z.object({
  enabled: z.boolean(),
  contamination_detection: ContaminationDetectionSchema,
  variance_injection: VarianceInjectionSchema,
  anomaly_detection: AnomalyDetectionSchema,
});

export type ProctoringConfig = z.infer<typeof ProctoringConfigSchema>;

/**
 * Target repository for real-world testing
 */
export const TargetRepoSchema = z.object({
  owner: z.string().min(1),
  repo: z.string().min(1),
  issue_label: z.string().min(1),
  max_issues: z.number().int().positive(),
});

export type TargetRepo = z.infer<typeof TargetRepoSchema>;

/**
 * Baseline threshold configuration
 */
export const BaselineThresholdsSchema = z.object({
  duration_ms_increase_max: z.number().min(0),
  response_bytes_increase_max: z.number().min(0),
  pass_rate_decrease_max: z.number().min(0),
});

export type BaselineThresholds = z.infer<typeof BaselineThresholdsSchema>;

/**
 * Execution settings
 */
export const ExecutionConfigSchema = z.object({
  max_concurrency: z.number().int().positive(),
  retry_count: z.number().int().min(0),
  retry_delay_seconds: z.number().min(0),
  output_dir: z.string().min(1),
  baseline: z.object({
    path: z.string().min(1),
    thresholds: BaselineThresholdsSchema,
  }),
});

export type ExecutionConfig = z.infer<typeof ExecutionConfigSchema>;

/**
 * Reporting settings
 */
export const ReportingConfigSchema = z.object({
  formats: z.array(z.enum(["json", "markdown", "html"])).min(1),
  output_dir: z.string().min(1),
  timing_breakdown: z.boolean(),
  include_output_samples: z.boolean(),
  sample_limit: z.number().int().positive(),
});

export type ReportingConfig = z.infer<typeof ReportingConfigSchema>;

/**
 * Complete benchmark suite configuration
 */
export const BenchmarkConfigSchema = z.object({
  name: z.string().min(1),
  version: z.string().min(1),
  description: z.string().min(1),
  tiers: z.array(TierConfigSchema).min(1),
  anchor_points: AnchorPointsConfigSchema,
  proctoring: ProctoringConfigSchema,
  target_repos: z.array(TargetRepoSchema),
  execution: ExecutionConfigSchema,
  reporting: ReportingConfigSchema,
});

export type BenchmarkConfig = z.infer<typeof BenchmarkConfigSchema>;

// =============================================================================
// Configuration Loader
// =============================================================================

// ES modules have no built-in __filename/__dirname, so derive them from
// import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

/**
 * Default path to the benchmark suite configuration: `suite.yaml` in the same
 * directory as this module.
 */
const DEFAULT_CONFIG_PATH = resolve(__dirname, "suite.yaml");

/**
 * Error thrown when configuration loading or validation fails
 *
 * `cause` carries the underlying error (file-system error, YAML parse error,
 * or ZodError) when one is available.
 */
export class ConfigLoadError extends Error {
  constructor(
    message: string,
    public readonly cause?: unknown
  ) {
    super(message);
    this.name = "ConfigLoadError";
  }
}

/**
 * Load and validate the benchmark suite configuration
 *
 * @param configPath - Optional path to configuration file. Defaults to the
 *   `suite.yaml` located next to this module.
 * @returns Validated BenchmarkConfig object
 * @throws ConfigLoadError if file doesn't exist, can't be read, can't be
 *   parsed, or fails validation
 */
export function loadBenchmarkConfig(configPath?: string): BenchmarkConfig {
  const path = configPath ?? DEFAULT_CONFIG_PATH;

  // Check file exists
  if (!existsSync(path)) {
    throw new ConfigLoadError(`Configuration file not found: ${path}`);
  }

  // Read file
  let content: string;
  try {
    content = readFileSync(path, "utf-8");
  } catch (err) {
    throw new ConfigLoadError(`Failed to read configuration file: ${path}`, err);
  }

  // Parse YAML
  let parsed: unknown;
  try {
    parsed = parseYaml(content);
  } catch (err) {
    throw new ConfigLoadError(`Failed to parse YAML: ${path}`, err);
  }

  // Validate with Zod
  const result = BenchmarkConfigSchema.safeParse(parsed);
  if (!result.success) {
    const issues = result.error.issues
      .map((issue) => {
        // Issues raised against the document itself (e.g. an empty file that
        // parses to null) have an empty path; label them explicitly instead
        // of printing a blank location.
        const location = issue.path.length > 0 ? issue.path.join(".") : "(root)";
        return `  - ${location}: ${issue.message}`;
      })
      .join("\n");
    throw new ConfigLoadError(
      `Configuration validation failed:\n${issues}`,
      result.error
    );
  }

  return result.data;
}

/**
 * Get a specific tier configuration by ID
 *
 * @param config - The benchmark configuration
 * @param tierId - The tier ID to find
 * @returns The first tier whose `id` equals `tierId`, or undefined if not found
 */
export function getTierById(
  config: BenchmarkConfig,
  tierId: string
): TierConfig | undefined {
  return config.tiers.find((tier) => tier.id === tierId);
}

/**
 * Get all test IDs from a tier
 *
 * @param tier - The tier configuration
 * @returns Array of test IDs, in the order the tests are listed in the tier
 */
export function getTestIds(tier: TierConfig): string[] {
  return tier.tests.map((test) => test.id);
}

/**
 * Get anchor point tests from configuration
 *
 * @param config - The benchmark configuration
 * @returns Array of anchor point test IDs; empty when anchor points are
 *   disabled
 */
export function getAnchorTestIds(config: BenchmarkConfig): string[] {
  if (!config.anchor_points.enabled) {
    return [];
  }
  return config.anchor_points.anchors.map((anchor) => anchor.test_id);
}

/**
 * Determine which tests to skip based on anchor point results
 *
 * For every sampling rule whose trigger matches the recorded anchor result,
 * each affected test is skipped independently with the rule's
 * `skip_probability`. Rules whose anchor has no recorded result never match.
 *
 * @param config - The benchmark configuration
 * @param anchorResults - Map of anchor test ID to pass/fail result
 * @param random - Source of random numbers in the range [0, 1). Defaults to
 *   `Math.random`; pass a seeded generator for reproducible sampling.
 * @returns Array of unique test IDs to skip, in first-selected order; empty
 *   when anchor points are disabled
 */
export function getTestsToSkip(
  config: BenchmarkConfig,
  anchorResults: Map<string, boolean>,
  random: () => number = Math.random
): string[] {
  if (!config.anchor_points.enabled) {
    return [];
  }

  // A Set deduplicates tests that are selected by more than one rule while
  // preserving insertion order.
  const testsToSkip = new Set<string>();

  for (const rule of config.anchor_points.sampling_rules) {
    const anchorPassed = anchorResults.get(rule.trigger.anchor_id);

    const triggerMatches =
      (rule.trigger.result === "pass" && anchorPassed === true) ||
      (rule.trigger.result === "fail" && anchorPassed === false);

    if (!triggerMatches) {
      continue;
    }

    // Apply skip probability: one draw per affected test, even if that test
    // has already been selected by an earlier rule.
    for (const testId of rule.affected_tests) {
      if (random() < rule.skip_probability) {
        testsToSkip.add(testId);
      }
    }
  }

  return [...testsToSkip];
}

/**
 * Check if proctoring features are enabled
 *
 * A feature is reported as active only when both the top-level proctoring
 * switch and the feature's own `enabled` flag are true.
 *
 * @param config - The benchmark configuration
 * @returns Object describing which proctoring features are active
 */
export function getProctoringStatus(config: BenchmarkConfig): {
  enabled: boolean;
  contamination: boolean;
  variance: boolean;
  anomaly: boolean;
} {
  const { proctoring } = config;
  return {
    enabled: proctoring.enabled,
    contamination: proctoring.enabled && proctoring.contamination_detection.enabled,
    variance: proctoring.enabled && proctoring.variance_injection.enabled,
    anomaly: proctoring.enabled && proctoring.anomaly_detection.enabled,
  };
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/glassBead-tc/Thoughtbox'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

config-loader.ts•10 KiB