# Benchmark Registry
# Maps capability targets to concrete evaluation methods
---
# Each entry maps one evaluation method to the capability targets it measures.
# Fields: name (id), type (external | external-repo | internal), url/path
# (where the suite lives), targets (capability list), cost_estimate (optional),
# notes (optional free text).
benchmarks:
  # External benchmarks - source of truth (we don't control them)
  - name: swe-bench-lite
    type: external
    url: https://github.com/princeton-nlp/SWE-bench
    targets: [context-retrieval, reasoning-coherence]
    cost_estimate: "$0.50-2.00 per issue"
    notes: "Use verified subset for faster iteration"
  - name: langchain-issues
    type: external-repo
    url: https://github.com/langchain-ai/langchain
    # Quoted: value contains ':' and '>' which are YAML-significant in context
    issue_filter: "label:bug created:>2024-01-01"
    targets: [reasoning-coherence]
    cost_estimate: "varies"
    notes: "High agentic user base, good proxy for real-world"
  - name: letta-context-bench
    type: external
    url: https://github.com/letta-ai/letta-evals
    targets: [context-retrieval]
    notes: "Memory/context-focused evaluations"
  # Internal benchmarks - our own test suites
  - name: thoughtbox-behavioral
    type: internal
    path: tests/behavioral/
    # NOTE(review): "all" presumably expands to every capability target —
    # confirm against the consumer of this registry
    targets: [all]
    notes: "Agentic test suite - agent-executed, semantically evaluated"
# Benchmark Selection Principles:
# 1. External CI is the source of truth - if we don't control the test suite, we can't game it
# 2. Agentic-heavy repos preferred - LangChain, LlamaIndex, CrewAI issues reflect real agent usage patterns
# 3. Cost-aware iteration - use cheaper benchmarks for exploration, expensive ones for validation
# 4. Multiple signals - no single benchmark captures everything; triangulate