# Thoughtbox Self-Improvement Loop (SIL) Benchmark Suite
# SPEC: SIL-002
#
# This configuration defines the benchmark suite used to evaluate
# improvements to the thoughtbox MCP server. It includes:
# - Three evaluation tiers (smoke-test, regression, real-world)
# - Anchor point sampling for cost reduction
# - Proctoring settings for gaming prevention
# - Target repositories for real-world issue testing
---
name: thoughtbox-improvement
version: "1.0"
description: Benchmark suite for evaluating thoughtbox server improvements
# =============================================================================
# Evaluation Tiers
# =============================================================================
# Each tier represents a different depth of testing.
# Tiers are run in order; failing an earlier tier skips later ones.
tiers:
# ---------------------------------------------------------------------------
# Tier 1: Smoke Test (< 30 seconds)
# Fast sanity check - does the system still work at all?
# ---------------------------------------------------------------------------
- id: smoke-test
name: Smoke Test
description: Quick sanity check that core functionality works
timeout_seconds: 30
    required_pass_rate: 1.0  # 100% must pass
tests:
- id: init-basic
toolhost: init
name: Basic initialization
description: Verify init tool responds correctly
steps:
- operation: get_state
expectedBehavior: Returns current connection stage
- id: thought-basic
toolhost: thoughtbox
name: Single thought
description: Verify thought tool accepts a basic thought
steps:
- operation: thought
args:
thought: "Testing basic thought functionality"
thoughtNumber: 1
totalThoughts: 1
expectedBehavior: Returns acknowledgment with thought ID
- id: notebook-basic
toolhost: notebook
name: Create notebook
description: Verify notebook creation works
steps:
- operation: create_notebook
args:
title: "Test Notebook"
expectedBehavior: Returns notebook ID
# ---------------------------------------------------------------------------
# Tier 2: Regression Tests (< 5 minutes)
# Comprehensive tests against known-good baseline
# ---------------------------------------------------------------------------
- id: regression
name: Regression Tests
description: Full test suite comparing against baseline metrics
timeout_seconds: 300
    required_pass_rate: 0.95  # 95% must pass
tests:
- id: thought-chain-10
toolhost: thoughtbox
name: 10-thought chain
description: Build a chain of 10 sequential thoughts
steps:
- operation: thought
args:
thought: "First thought in chain"
thoughtNumber: 1
totalThoughts: 10
expectedBehavior: Chain links correctly
- id: thought-branch
toolhost: thoughtbox
name: Branch creation
description: Create a branch from an existing thought
steps:
- operation: thought
args:
thought: "Alternative approach"
branchFromThought: 5
branchId: "alternative-a"
expectedBehavior: Branch created with correct linkage
- id: thought-revision
toolhost: thoughtbox
name: Thought revision
description: Revise an existing thought
steps:
- operation: thought
args:
thought: "Revised understanding"
revisesThought: 3
expectedBehavior: Revision linked to original
- id: notebook-cells
toolhost: notebook
name: Notebook cell operations
description: Add, execute, and modify cells
steps:
- operation: add_cell
args:
notebookId: "${notebookId}"
type: code
language: typescript
content: "const x = 1 + 1;"
expectedBehavior: Cell added with ID
- id: mental-models-apply
toolhost: mental_models
name: Apply mental model
description: Apply a mental model to a problem
steps:
- operation: apply_model
args:
model: first_principles
problem: "Why is the sky blue?"
expectedBehavior: Model applied with structured output
- id: session-export
toolhost: init
name: Session export
description: Export a session with thoughts
steps:
- operation: export_session
args:
format: json
expectedBehavior: Returns valid JSON export
# ---------------------------------------------------------------------------
# Tier 3: Real-World Tests (< 30 minutes)
# Tests against actual GitHub issues and use cases
# ---------------------------------------------------------------------------
- id: real-world
name: Real-World Tests
description: Tests using actual GitHub issues as scenarios
timeout_seconds: 1800
    required_pass_rate: 0.80  # 80% must pass
tests:
- id: github-issue-reasoning
toolhost: thoughtbox
name: GitHub issue reasoning
description: Use thoughtbox to reason about a real GitHub issue
source: github
steps:
- operation: thought
args:
thought: "Analyzing issue: ${issue.title}"
thoughtNumber: 1
totalThoughts: 5
expectedBehavior: Coherent analysis chain
# =============================================================================
# Anchor Point Sampling
# =============================================================================
# To reduce evaluation costs, we sample from "anchor points" - tests that
# correlate strongly with overall system quality. If anchor points pass,
# we can skip some other tests with high confidence.
anchor_points:
enabled: true
  confidence_threshold: 0.95  # Minimum confidence in an anchor's prediction before sampling rules may skip tests
# Tests that predict overall quality
anchors:
- test_id: thought-chain-10
      correlation: 0.92  # 0.92 correlation with overall suite outcomes
description: Chain building tests fundamental linking logic
- test_id: session-export
correlation: 0.88
description: Export tests persistence layer
- test_id: mental-models-apply
correlation: 0.85
description: Mental models test schema validation
# When anchor points pass, these tests can be sampled
sampling_rules:
- trigger:
anchor_id: thought-chain-10
result: pass
skip_probability: 0.7 # Skip 70% of related tests
affected_tests:
- thought-basic
- thought-revision
# =============================================================================
# Proctoring & Gaming Prevention
# =============================================================================
# Settings to prevent the SIL from gaming the benchmarks
proctoring:
enabled: true
# Baseline contamination detection
contamination_detection:
# Hash of baseline test data - if the system sees these exact patterns
# in training, the test is contaminated
enabled: true
baseline_hash_check: true
# Variance injection - add randomness to prevent memorization
variance_injection:
enabled: true
# Permute the order of tests
permute_test_order: true
# Add slight variations to test inputs
input_fuzzing: true
fuzzing_seed_rotation: weekly
# Statistical anomaly detection
anomaly_detection:
enabled: true
# Flag if improvement on benchmark > improvement on real tasks
benchmark_real_world_gap_threshold: 0.15
# Flag if suspiciously perfect scores
perfect_score_investigation: true
# =============================================================================
# Target Repositories
# =============================================================================
# Real repositories to source issues from for real-world testing
target_repos:
- owner: anthropics
repo: claude-code
issue_label: good-first-issue
max_issues: 10
- owner: modelcontextprotocol
repo: servers
issue_label: bug
max_issues: 5
- owner: Kastalien-Research
repo: thoughtbox
issue_label: enhancement
max_issues: 5
# =============================================================================
# Execution Settings
# =============================================================================
execution:
# Maximum concurrent test executions
max_concurrency: 4
# Retry failed tests this many times
retry_count: 2
# Time between retries
retry_delay_seconds: 5
# Output directory for results
output_dir: ./dgm-specs/history/runs
# Baseline comparison settings
baseline:
# Path to baseline results
path: ./dgm-specs/history/baseline.json
    # Regression thresholds: maximum allowed percentage change relative to
    # the baseline (e.g. 20 = duration may grow at most 20% before flagging)
    thresholds:
duration_ms_increase_max: 20
response_bytes_increase_max: 10
pass_rate_decrease_max: 5
# =============================================================================
# Reporting
# =============================================================================
reporting:
# Format for reports
formats:
- json
- markdown
# Where to output reports
output_dir: ./dgm-specs/history/reports
# Include detailed timing breakdowns
timing_breakdown: true
# Include test output samples
include_output_samples: true
sample_limit: 3