---
# CQS evaluation pipeline configuration (V2 pairwise judging).
# Version/date are quoted strings so parsers don't coerce them to float/date.
version: '2.0'
created: '2026-02-16'
purpose: 'CQS multi-vendor judge scoring — V2 pairwise comparisons'
# Model used to generate responses (Stage 1)
caller:
  model: 'claude-sonnet-4-5-20250929'
  provider: 'anthropic'
  api_key_env: 'ANTHROPIC_API_KEY'
  max_tokens: 2048
  max_tool_rounds: 20
# Stage 2 judge panel — one entry per API vendor.
# Vendor-specific quirks (temperature handling, token-limit param name) are
# captured per judge so the runner can branch on them.
judges:
  anthropic:
    model: 'claude-opus-4-5-20251101'
    provider: 'anthropic'
    api_key_env: 'ANTHROPIC_API_KEY'
    temperature: 0.0
    notes: 'Anthropic flagship — potential self-enhancement bias on Claude outputs'
  openai:
    model: 'gpt-5.2'
    provider: 'openai'
    api_key_env: 'OPENAI_API_KEY'
    temperature: null  # explicit null — this API rejects a temperature param
    max_tokens_param: 'max_completion_tokens'
    notes: 'OpenAI flagship — does NOT accept temperature param'
  google:
    model: 'gemini-3-pro-preview'
    provider: 'google'
    api_key_env: 'GEMINI_API_KEY'
    temperature: 1.0
    max_output_tokens: 8192  # Increased from default 4096 - Gemini truncates complex JSON
    notes: 'Google flagship — temperature must be 1.0 per Google docs'
# V2 pairwise comparisons — maps comparison name to response file pairs
# condition_a is presented first in odd passes, condition_b first in even passes
comparisons:
  rag_vs_pragmatics:
    condition_a: 'rag'
    condition_b: 'pragmatics'
    file_a: 'results/v2_redo/stage1/rag_responses_20260216_055354.jsonl'
    file_b: 'results/v2_redo/stage1/pragmatics_responses_20260216_074817.jsonl'
    notes: 'Core research question — run first'
  control_vs_pragmatics:
    condition_a: 'control'
    condition_b: 'pragmatics'
    file_a: 'results/v2_redo/stage1/control_responses_20260216_055354.jsonl'
    file_b: 'results/v2_redo/stage1/pragmatics_responses_20260216_074817.jsonl'
    notes: 'Pragmatics vs baseline'
  control_vs_rag:
    condition_a: 'control'
    condition_b: 'rag'
    file_a: 'results/v2_redo/stage1/control_responses_20260216_055354.jsonl'
    file_b: 'results/v2_redo/stage1/rag_responses_20260216_055354.jsonl'
    notes: 'Does RAG even help?'
# Stage 2 runner settings.
pipeline:
  random_seed: 42
  max_tokens: 4096
  rate_limit_delay: 1.0
  checkpoint_interval: 10
  max_workers_per_vendor: 5  # concurrent calls per API vendor (3 safe, 5 tested OK)
  num_passes: 6  # 6 passes per judge per query
  # Pass schedule:
  # Pass 1: condition_a_first (base scoring)
  # Pass 2: condition_b_first (position bias)
  # Pass 3: condition_a_first (test-retest #1)
  # Pass 4: condition_b_first (test-retest #1 x position)
  # Pass 5: condition_a_first (test-retest #2, CIs)
  # Pass 6: condition_b_first (test-retest #2 x position, CIs)
# Rubric shape presented to judges: six dimensions on a 0-2 scale.
scoring:
  dimensions: ['D1', 'D2', 'D3', 'D4', 'D5', 'D6']
  scale_min: 0
  scale_max: 2
  request_confidence: true  # Ask judges for per-dimension confidence
  confidence_scale: [1, 2, 3, 4, 5]
# Stage 3 fidelity-check model (cheaper model, deterministic at temperature 0).
fidelity:
  model: 'claude-haiku-4-5-20251001'
  provider: 'anthropic'
  api_key_env: 'ANTHROPIC_API_KEY'
  temperature: 0.0
  max_tokens: 4096
  max_retries: 3
  rate_limit_delay: 0.5
# Stage 3 aggregation inputs/outputs.
aggregate:
  input_file: 'results/v2_redo/stage3/fidelity_20260219_214225.jsonl'
  output_dir: 'results/v2_redo/stage3/analysis'
  battery_path: 'src/eval/battery/queries.yaml'
# Statistical analysis settings for Stage 2 scores.
analysis:
  alpha: 0.05
  bootstrap_iterations: 10000
  correction_method: 'holm'  # Holm-Bonferroni FWER control (uniformly more powerful than plain Bonferroni)
  dimensions: ['D1', 'D2', 'D3', 'D4', 'D5']  # D6 excluded from CQS composite (see cqs_rubric_specification.md)
  output_dir: 'results/v2_redo/stage2/analysis'
# File-system layout for all stages. V1 entries are archived references only.
paths:
  # V1 (archived, do not use)
  stage1_results: 'results/cqs_responses_20260213_091530.jsonl'
  # V2 paths
  battery: 'src/eval/battery/queries.yaml'
  output_dir: 'results/v2_redo/stage2'
  checkpoint_dir: 'results/v2_redo/stage2/checkpoints'
  analysis_output: 'results/v2_redo/stage2/analysis'
  # V2 Stage 3 fidelity inputs (same files as Stage 1 outputs)
  stage3_inputs:
    control: 'results/v2_redo/stage1/control_responses_20260216_055354.jsonl'
    rag: 'results/v2_redo/stage1/rag_responses_20260216_055354.jsonl'
    pragmatics: 'results/v2_redo/stage1/pragmatics_responses_20260216_074817.jsonl'
  stage3_output_dir: 'results/v2_redo/stage3'
  # V1 archived references (do not mix with V2)
  # Run IDs are quoted so underscore-digit strings are never re-typed by parsers.
  stage2_valid_run_ids:
    - "20260213_125057"  # OpenAI v3 part 1 (17 queries, 100 records)
    - "20260213_125945"  # OpenAI v3 part 2 (23 queries, 134 records)
    - "20260214_211036"  # Anthropic + Google v3 (39 queries each, 468 records)