#!/usr/bin/env python3
"""
Run SGR Searcher with detailed logging for comparison with manual search.
Saves all LLM reasoning steps and tool calls to log files.
"""
import json
import os
import sys
import time
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List
# Add src to path
sys.path.insert(0, str(Path(__file__).parent / "src"))
from searchers import SGRSearcherGemini
# Configuration
REPO_BASE = Path(__file__).parent / "data" / "issues"
DATASET_PATH = Path(__file__).parent / "data" / "dataset.jsonl"
LOGS_DIR = Path(__file__).parent / "logs" / "sgr_runs"
class DetailedLogger:
"""Captures all SGR steps for analysis."""
def __init__(self, question_id: str):
self.question_id = question_id
self.steps: List[Dict[str, Any]] = []
self.start_time = time.time()
def log_step(self, step_name: str, data: Dict[str, Any]):
"""Log a step with timestamp."""
self.steps.append({
"timestamp": time.time() - self.start_time,
"step": step_name,
"data": data,
})
def save(self, output_dir: Path):
"""Save logs to file."""
output_dir.mkdir(parents=True, exist_ok=True)
log_file = output_dir / f"{self.question_id}_detailed.json"
with open(log_file, 'w') as f:
json.dump({
"question_id": self.question_id,
"total_time_seconds": time.time() - self.start_time,
"steps": self.steps,
}, f, indent=2, default=str)
# Also save human-readable version
md_file = output_dir / f"{self.question_id}_steps.md"
with open(md_file, 'w') as f:
f.write(self._to_markdown())
return log_file, md_file
def _to_markdown(self) -> str:
"""Convert logs to readable markdown."""
lines = [
f"# SGR Search Log: {self.question_id}",
f"\nTotal time: {time.time() - self.start_time:.1f}s",
f"\n---\n",
]
for i, step in enumerate(self.steps, 1):
lines.append(f"\n## Step {i}: {step['step']} (+{step['timestamp']:.1f}s)")
lines.append("")
data = step['data']
if isinstance(data, dict):
for key, value in data.items():
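                    # Truncate long strings and serialized structures so the markdown stays readable.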
if isinstance(value, str) and len(value) > 500:
value = value[:500] + "..."
elif isinstance(value, (list, dict)):
value = json.dumps(value, indent=2, default=str)
if len(value) > 1000:
value = value[:1000] + "\n..."
lines.append(f"**{key}:**")
lines.append(f"```")
lines.append(str(value))
lines.append(f"```")
lines.append("")
else:
lines.append(f"```\n{data}\n```")
return "\n".join(lines)
class InstrumentedSGRSearcher(SGRSearcherGemini):
"""SGR Searcher with detailed logging of all steps."""
def __init__(self, logger: DetailedLogger, **kwargs):
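        # Attach the detailed logger before base-class init so that any messages
        # emitted during setup are captured by our _log override.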
self.detailed_logger = logger
super().__init__(**kwargs)
def _log(self, message: str) -> None:
"""Override to capture all logs."""
super()._log(message)
# Also capture to detailed logger
self.detailed_logger.log_step("log", {"message": message})
def _call_structured(self, messages, response_format):
"""Override to capture LLM calls."""
# Log the prompt
self.detailed_logger.log_step("llm_call_start", {
"response_format": response_format.__name__,
"messages": [
                {
                    "role": m.get("role"),
                    # content may be None (e.g. tool-call messages), so coalesce before slicing
                    "content": (m.get("content") or "")[:2000]
                    + ("..." if len(m.get("content") or "") > 2000 else ""),
                }
for m in messages
],
})
# Make the call
result = super()._call_structured(messages, response_format)
# Log the response
self.detailed_logger.log_step("llm_response", {
"response_format": response_format.__name__,
"result": result.model_dump() if hasattr(result, 'model_dump') else str(result),
})
return result
    def _call_llm_text(self, messages):
        """Override to capture plain-text LLM calls (e.g., for path correction)."""
# Log the prompt
self.detailed_logger.log_step("llm_text_call_start", {
"messages": [
                {
                    "role": m.get("role"),
                    # content may be None (e.g. tool-call messages), so coalesce before slicing
                    "content": (m.get("content") or "")[:1000]
                    + ("..." if len(m.get("content") or "") > 1000 else ""),
                }
for m in messages
],
})
# Make the call
result = super()._call_llm_text(messages)
# Log the response
self.detailed_logger.log_step("llm_text_response", {
"result": result[:500] + ("..." if len(result) > 500 else ""),
})
return result
def _analyze_repo_context(self, repo_path, tools):
"""Override to log repo analysis."""
self.detailed_logger.log_step("analyze_repo_start", {
"repo_path": repo_path,
})
result = super()._analyze_repo_context(repo_path, tools)
self.detailed_logger.log_step("analyze_repo_result", {
"primary_language": result.primary_language,
"framework": result.framework,
"project_type": result.project_type,
"key_directories": result.key_directories,
})
return result
def _analyze_question(self, query, repo_hint):
"""Override to log question analysis."""
self.detailed_logger.log_step("analyze_question_start", {
"query": query,
"repo_hint": repo_hint,
})
result = super()._analyze_question(query, repo_hint)
self.detailed_logger.log_step("analyze_question_result", {
"question_type": result.question_type,
"concepts": result.concepts,
"user_problem": result.user_problem,
"what_to_find": result.what_to_find,
})
return result
def load_all_questions() -> List[Dict]:
"""Load all questions from dataset."""
questions = []
with open(DATASET_PATH) as f:
for line in f:
if line.strip():
questions.append(json.loads(line))
return questions
def run_single_question(question: Dict, output_dir: Path) -> Dict:
"""Run SGR on a single question with detailed logging."""
question_id = question['id']
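    # Each dataset entry keeps its repository checkout under data/issues/<repo_path>/repo.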
repo_path = REPO_BASE / question['repo_path'] / "repo"
print(f"\n{'='*80}")
print(f"Question: {question_id}")
print(f"Repo: {question['repo_path']}")
print(f"Query: {question['query'][:100]}...")
print(f"{'='*80}")
if not repo_path.exists():
print(f"ERROR: Repo not found at {repo_path}")
return {"id": question_id, "error": "repo not found"}
# Create logger
logger = DetailedLogger(question_id)
logger.log_step("start", {
"question": question['query'],
"repo_path": str(repo_path),
"expected_files": [item['file_path'] for item in question.get('expected_items', [])],
})
# Create instrumented searcher
searcher = InstrumentedSGRSearcher(
logger=logger,
model="gemini-2.5-flash-lite", # Light model as requested
max_iterations=8,
verbose=True,
)
# Run search
try:
result = searcher.search(
query=question['query'],
repo_path=str(repo_path),
)
# Log final result
logger.log_step("final_result", {
"found_files": [item.file_path for item in result.items],
"patterns_used": result.patterns_used,
"total_time_ms": result.total_time_ms,
"error": result.error,
})
except Exception as e:
import traceback
logger.log_step("error", {
"error": str(e),
"traceback": traceback.format_exc(),
})
result = None
# Save logs
log_file, md_file = logger.save(output_dir)
print(f"\nLogs saved to:\n {log_file}\n {md_file}")
# Compare with expected
expected_files = [item['file_path'] for item in question.get('expected_items', [])]
found_files = [item.file_path for item in result.items] if result else []
matched = []
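    # Count an expected file as matched if either path string contains the other,
    # which tolerates partial or differently rooted paths.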
for expected in expected_files:
for found in found_files:
if expected in found or found in expected:
matched.append(expected)
break
recall = len(matched) / len(expected_files) if expected_files else 0
print(f"\n--- RESULTS ---")
print(f"Expected: {expected_files}")
print(f"Found: {found_files[:5]}")
print(f"Matched: {matched}")
print(f"Recall: {recall:.0%}")
return {
"id": question_id,
"expected": expected_files,
"found": found_files,
"matched": matched,
"recall": recall,
"time_ms": result.total_time_ms if result else 0,
"error": result.error if result else "exception",
}
def main():
"""Run SGR on all questions and save detailed logs."""
# Create output directory with timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = LOGS_DIR / timestamp
output_dir.mkdir(parents=True, exist_ok=True)
print(f"Output directory: {output_dir}")
# Load questions
questions = load_all_questions()
print(f"Loaded {len(questions)} questions")
# Run on all questions
# questions = questions[:2] # Uncomment to test with first 2 questions only
results = []
for question in questions:
try:
result = run_single_question(question, output_dir)
results.append(result)
except Exception as e:
print(f"ERROR processing {question['id']}: {e}")
import traceback
traceback.print_exc()
results.append({"id": question['id'], "error": str(e)})
# Save summary
summary_file = output_dir / "summary.json"
with open(summary_file, 'w') as f:
json.dump(results, f, indent=2)
# Print summary
print(f"\n{'='*80}")
print("SUMMARY")
print(f"{'='*80}")
total_recall = 0
total_time = 0
successful = 0
for r in results:
recall = r.get('recall', 0)
time_ms = r.get('time_ms', 0)
        # Treat a run as successful if it did not error out (or still produced matches);
        # note that completed runs always carry an 'error' key, possibly None.
        if not r.get('error') or r.get('recall'):
total_recall += recall
total_time += time_ms
successful += 1
print(f"{r['id']}: recall={recall:.0%}, time={time_ms:.0f}ms")
else:
print(f"{r['id']}: ERROR - {r.get('error')}")
if successful > 0:
print(f"\nSuccessful: {successful}/{len(results)}")
print(f"Average recall: {total_recall/successful:.0%}")
print(f"Average time: {total_time/successful:.0f}ms")
print(f"\nLogs saved to: {output_dir}")
# Create comparison report
create_comparison_report(results, output_dir)
def create_comparison_report(results: List[Dict], output_dir: Path):
    """Build a markdown report comparing SGR results against my manual-search baseline."""
# My results from manual search
my_results = {
"django_request_response_lifecycle": {"recall": 0.25, "files_found": ["django/core/handlers/base.py"]},
"django_queryset_lazy_evaluation": {"recall": 1.0, "files_found": ["django/db/models/query.py"]},
"gin_route_tree_matching": {"recall": 0.75, "files_found": ["tree.go"]},
"gin_context_pool_reuse": {"recall": 1.0, "files_found": ["gin.go", "context.go"]},
"codeql_dataflow_configuration": {"recall": 0.30, "files_found": ["shared/dataflow/.../DataFlowImpl.qll", "Xss.qll"]},
"codeql_security_query_structure": {"recall": 0.70, "files_found": ["DomBasedXssQuery.qll", "Xss.qll"]},
"infinigen_procedural_material_nodes": {"recall": 0.70, "files_found": ["node_wrangler.py", "surface.py"]},
"infinigen_asset_placement_pipeline": {"recall": 0.75, "files_found": ["placement.py", "density.py", "instance_scatter.py"]},
"qgis_map_rendering_pipeline": {"recall": 0.80, "files_found": ["qgsmaprendererparalleljob.h", "qgsmaprendererjob.h", "qgsmapcanvas.h"]},
"qgis_expression_evaluation": {"recall": 0.70, "files_found": ["qgsexpression.h", "qgspallabeling.h"]},
"claude_flow_agent_coordination": {"recall": 0.40, "files_found": ["task/coordination.ts"]},
"claude_flow_memory_system": {"recall": 0.30, "files_found": ["hive-mind/core/Memory.ts", "DatabaseManager.ts"]},
}
report_lines = [
"# SGR vs Manual Search Comparison",
f"\nGenerated: {datetime.now().isoformat()}",
"\n## Results Comparison\n",
"| Question | SGR Recall | My Recall | Winner | SGR Files | My Files |",
"|----------|------------|-----------|--------|-----------|----------|",
]
sgr_wins = 0
my_wins = 0
ties = 0
for r in results:
q_id = r['id']
sgr_recall = r.get('recall', 0)
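        # Questions without a manual baseline default to zero recall and no files.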
my_data = my_results.get(q_id, {"recall": 0, "files_found": []})
my_recall = my_data['recall']
if sgr_recall > my_recall:
winner = "**SGR**"
sgr_wins += 1
elif my_recall > sgr_recall:
winner = "**Manual**"
my_wins += 1
else:
winner = "Tie"
ties += 1
sgr_files = ", ".join(r.get('found', [])[:3])[:50]
my_files = ", ".join(my_data['files_found'][:3])[:50]
report_lines.append(
f"| {q_id[:30]} | {sgr_recall:.0%} | {my_recall:.0%} | {winner} | {sgr_files}... | {my_files}... |"
)
report_lines.extend([
"\n## Summary",
f"\n- SGR wins: {sgr_wins}",
f"- Manual wins: {my_wins}",
f"- Ties: {ties}",
"\n## Analysis",
"\n### Where SGR does better:",
"- (will be filled based on results)",
"\n### Where Manual search does better:",
"- (will be filled based on results)",
"\n### Common failure patterns:",
"- (will be analyzed from logs)",
])
report_file = output_dir / "comparison_report.md"
with open(report_file, 'w') as f:
f.write("\n".join(report_lines))
print(f"\nComparison report saved to: {report_file}")
if __name__ == "__main__":
main()