#!/usr/bin/env python3
"""
Test SGR Searcher on evaluation dataset.
Runs a few questions and shows detailed step-by-step output.
"""
import json
import os
import sys
from pathlib import Path
# Add src to path
sys.path.insert(0, str(Path(__file__).parent / "src"))
from searchers import SGRSearcherGemini
# Configuration: evaluation data is looked up in a "data" directory that sits
# next to this script.
_DATA_DIR = Path(__file__).parent / "data"
REPO_BASE = _DATA_DIR / "issues"  # one sub-directory per question's repo_path
DATASET_PATH = _DATA_DIR / "dataset.jsonl"  # one JSON question per line
def load_questions(limit: int = 3) -> list:
    """Load up to ``limit`` questions from the JSONL dataset.

    Each non-blank line of ``DATASET_PATH`` is parsed as one JSON document.
    Reading stops as soon as ``limit`` questions have been collected.

    Args:
        limit: Maximum number of questions to return. Values <= 0 yield an
            empty list without touching the file.

    Returns:
        The parsed questions, in file order.

    Raises:
        OSError: If the dataset file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if limit <= 0:
        return []

    questions = []
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(DATASET_PATH, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank / whitespace-only lines
            questions.append(json.loads(line))
            if len(questions) >= limit:
                break
    return questions
def test_question(searcher, question: dict) -> dict:
    """Run one dataset question through the searcher and report the outcome.

    Prints the question header, the first five files found, every expected
    file with a hit/miss marker, timing, and the first patterns used.

    Args:
        searcher: Object exposing ``search(query=..., repo_path=...)``. The
            returned result is read for ``items`` (each with ``file_path``),
            ``total_time_ms``, ``execution_time_ms``, ``tool_time_ms``,
            ``patterns_used`` and ``error``.
        question: Dataset record with ``id``, ``repo_path`` and ``query``.
            ``expected_items`` (dicts with ``file_path``) is optional.

    Returns:
        On a completed search, a dict with ``id``, ``expected``, ``found``,
        ``matched``, ``recall`` and ``time_ms``. When the repo checkout is
        missing, ``{"id": ..., "error": "repo not found"}``.
    """
    banner = '=' * 80
    print(f"\n{banner}")
    print(f"Question ID: {question['id']}")
    print(f"Repo: {question['repo_path']}")
    print(f"Query: {question['query'][:100]}...")
    print(banner)

    repo_path = REPO_BASE / question['repo_path'] / "repo"
    if not repo_path.exists():
        print(f"ERROR: Repo not found at {repo_path}")
        # Keep the id so callers can still attribute the failure to a question.
        return {"id": question['id'], "error": "repo not found"}

    # Run search
    result = searcher.search(
        query=question['query'],
        repo_path=str(repo_path),
    )

    # Analyze results
    expected_files = [item['file_path'] for item in question.get('expected_items', [])]
    found_files = [item.file_path for item in result.items]

    # An expected file counts as matched when any found path overlaps with it.
    matched = [
        expected
        for expected in expected_files
        if any(_paths_overlap(expected, found) for found in found_files)
    ]

    print("\n--- RESULTS ---")
    print(f"Found {len(result.items)} files:")
    for item in result.items[:5]:
        print(f" - {item.file_path}")

    print(f"\nExpected {len(expected_files)} files:")
    for path in expected_files:
        status = "✅" if path in matched else "❌"
        print(f" {status} {path}")

    print(f"\nMatched: {len(matched)}/{len(expected_files)}")
    print(f"Time: {result.total_time_ms:.0f}ms (LLM: {result.execution_time_ms:.0f}ms, Tools: {result.tool_time_ms:.0f}ms)")
    print(f"Patterns used: {result.patterns_used[:3]}...")
    if result.error:
        print(f"Error: {result.error}")

    return {
        "id": question['id'],
        "expected": expected_files,
        "found": found_files,
        "matched": matched,
        # No expected files means there is nothing to recall; report 0.
        "recall": len(matched) / len(expected_files) if expected_files else 0,
        "time_ms": result.total_time_ms,
    }


# Despite the ``test_`` prefix this is a helper that takes arguments, not a
# pytest test; keep pytest from collecting it and failing on missing fixtures.
test_question.__test__ = False


def _paths_overlap(expected: str, found: str) -> bool:
    """Return True when one non-empty path string contains the other."""
    # An empty string is a substring of everything, so it must never match.
    if not expected or not found:
        return False
    return expected in found or found in expected
def main():
    """Debug-run one hard-coded dataset question and print a summary.

    Loads up to 12 questions, selects the one at index 4, runs it through an
    ``SGRSearcherGemini`` configured for verbose output, then prints the
    per-question recall and timing followed by the averages.

    Exits with a message on stderr (status 1) when fewer questions were
    loaded than the debug index requires.
    """
    import traceback

    # Debug CodeQL question
    debug_index = 4  # codeql_dataflow_configuration
    all_questions = load_questions(limit=12)
    if debug_index >= len(all_questions):
        # Fail with an actionable message instead of a bare IndexError.
        sys.exit(
            f"ERROR: question index {debug_index} requested but only "
            f"{len(all_questions)} question(s) loaded from {DATASET_PATH}"
        )
    questions = [all_questions[debug_index]]
    print(f"Debug: {questions[0]['id']}")

    searcher = SGRSearcherGemini(
        model="gemini-2.5-flash-lite",
        max_iterations=8,
        verbose=True,  # Full debug
    )

    results = []
    for q in questions:
        try:
            results.append(test_question(searcher, q))
        except Exception as e:
            # Top-level boundary of a debug script: report the failure and
            # record it so the summary below still prints.
            print(f"ERROR: {e}")
            traceback.print_exc()
            results.append({"id": q['id'], "error": str(e)})

    # Summary
    print(f"\n{'='*80}")
    print("SUMMARY")
    print(f"{'='*80}")

    total_recall = 0
    total_time = 0
    for r in results:
        # An error result may not carry an id; never let the summary crash.
        label = r.get('id', '<unknown>')
        if 'error' not in r or r.get('recall'):
            recall = r.get('recall', 0)
            elapsed = r.get('time_ms', 0)
            total_recall += recall
            total_time += elapsed
            print(f"{label}: recall={recall:.0%}, time={elapsed:.0f}ms")
        else:
            print(f"{label}: ERROR - {r.get('error')}")

    if results:
        # Errored runs add nothing to the totals but still count in the
        # denominator, so they pull both averages towards zero.
        print(f"\nAverage recall: {total_recall/len(results):.0%}")
        print(f"Average time: {total_time/len(results):.0f}ms")
# Entry point: run the debug session only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()