"""Command line interface for kodit benchmarks."""
import json
from pathlib import Path
import click
import structlog
from benchmark.minisweagent.runner import MiniSweAgentRunner, RunConfig
from benchmark.server import ServerProcess
from benchmark.swebench.evaluator import (
EvaluationError,
Evaluator,
)
from benchmark.swebench.loader import DatasetLoader
from benchmark.swebench.repository import (
DEFAULT_REPOS_DIR,
)
from kodit.config import AppContext
from kodit.log import configure_logging
DEFAULT_OUTPUT_DIR = Path("benchmarks/data")
def _extract_run_stats(output_dir: Path) -> dict:
"""Extract cost and token statistics from trajectory files."""
stats: dict = {
"total_cost": 0.0,
"total_api_calls": 0,
"total_tokens_sent": 0,
"total_tokens_received": 0,
"instance_count": 0,
"instances_with_patch": 0,
"instance_stats": {},
}
# Load predictions to check for patches
predictions_path = output_dir / "preds.json"
predictions = {}
if predictions_path.exists():
with predictions_path.open() as f:
predictions = json.load(f)
# Extract stats from trajectory files
for trajectory_path in output_dir.glob("*/*.traj.json"):
try:
with trajectory_path.open() as f:
trajectory = json.load(f)
default_id = trajectory_path.stem.replace(".traj", "")
instance_id = trajectory.get("instance_id", default_id)
info = trajectory.get("info", {})
model_stats = info.get("model_stats", {})
cost = model_stats.get("instance_cost", 0.0)
api_calls = model_stats.get("api_calls", 0)
tokens_sent = model_stats.get("tokens_sent", 0)
tokens_received = model_stats.get("tokens_received", 0)
stats["total_cost"] += cost
stats["total_api_calls"] += api_calls
stats["total_tokens_sent"] += tokens_sent
stats["total_tokens_received"] += tokens_received
stats["instance_count"] += 1
has_patch = bool(
predictions.get(instance_id, {}).get("model_patch", "").strip()
)
if has_patch:
stats["instances_with_patch"] += 1
stats["instance_stats"][instance_id] = {
"cost": cost,
"api_calls": api_calls,
"tokens_sent": tokens_sent,
"tokens_received": tokens_received,
"has_patch": has_patch,
"exit_status": info.get("exit_status", "Unknown"),
}
        except (json.JSONDecodeError, OSError):
            # Skip trajectory files that are unreadable or contain malformed JSON.
            pass
return stats
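# Note: the loop above assumes trajectory files shaped roughly like the sketch
# below (only the fields it actually reads are shown; values are illustrative):
#
#   {
#     "instance_id": "<swe-bench instance id>",
#     "info": {
#       "exit_status": "<exit status string>",
#       "model_stats": {
#         "instance_cost": 0.0123,
#         "api_calls": 7,
#         "tokens_sent": 15432,
#         "tokens_received": 1987
#       }
#     }
#   }
#
# preds.json is expected to map each instance id to an object with a "model_patch"
# string; an empty or missing patch counts as "no patch" above.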
def _load_evaluation_results(
eval_path: Path | None,
output_dir: Path,
run_id: str,
) -> dict:
"""Load evaluation results from JSON file."""
log = structlog.get_logger(__name__)
results: dict = {
"resolved_ids": set(),
"unresolved_ids": set(),
"error_ids": set(),
"empty_patch_ids": set(),
"total": 0,
"resolved": 0,
}
# Try to find evaluation file
if eval_path and eval_path.exists():
path = eval_path
else:
# Look for evaluation results in common locations
candidates = [
Path.cwd() / f"mini-swe-agent.{run_id}.json",
output_dir / f"mini-swe-agent.{run_id}.json",
Path.cwd() / f"unknown.{run_id}.json",
]
candidates.extend(list(Path.cwd().glob(f"*.{run_id}.json")))
path = None
for candidate in candidates:
if candidate.exists():
path = candidate
break
if path and path.exists():
log.info("Found evaluation results", path=str(path))
with path.open() as f:
data = json.load(f)
results["resolved_ids"] = set(data.get("resolved_ids", []))
results["unresolved_ids"] = set(data.get("unresolved_ids", []))
results["error_ids"] = set(data.get("error_ids", []))
results["empty_patch_ids"] = set(data.get("empty_patch_ids", []))
results["total"] = data.get("total_instances", 0)
results["resolved"] = data.get("resolved_instances", 0)
else:
log.warning("No evaluation results found", run_id=run_id)
return results
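# Note: the SWE-bench evaluation report consumed above is expected to provide the
# keys read from `data`; an illustrative (invented) minimal document would be:
#
#   {
#     "total_instances": 10,
#     "resolved_instances": 4,
#     "resolved_ids": ["<instance id>", "..."],
#     "unresolved_ids": ["<instance id>"],
#     "error_ids": [],
#     "empty_patch_ids": []
#   }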
def _print_section(title: str, divider: str = "-") -> None:
"""Print a section header."""
click.echo("\n" + divider * 70)
click.echo(title)
click.echo(divider * 70)
def _print_metric_row(label: str, baseline_val: str, kodit_val: str) -> None:
"""Print a single metric comparison row."""
click.echo(f"{label:<30} {baseline_val:>15} {kodit_val:>15}")
def _display_comparison_report(
baseline_stats: dict,
kodit_stats: dict,
baseline_results: dict,
kodit_results: dict,
output: Path,
) -> None:
"""Display formatted comparison report to terminal."""
# Compute instance sets
b_resolved_ids = baseline_results["resolved_ids"]
k_resolved_ids = kodit_results["resolved_ids"]
both = b_resolved_ids & k_resolved_ids
baseline_only = b_resolved_ids - k_resolved_ids
kodit_only = k_resolved_ids - b_resolved_ids
all_ids = (
baseline_results["resolved_ids"]
| baseline_results["unresolved_ids"]
| baseline_results["error_ids"]
| baseline_results["empty_patch_ids"]
| kodit_results["resolved_ids"]
| kodit_results["unresolved_ids"]
| kodit_results["error_ids"]
| kodit_results["empty_patch_ids"]
)
neither = all_ids - b_resolved_ids - k_resolved_ids
# Header
_print_section("BENCHMARK COMPARISON: BASELINE vs KODIT", "=")
# Performance section
_print_section("PERFORMANCE (Pass/Fail)")
_print_metric_row("Metric", "Baseline", "Kodit")
click.echo("-" * 70)
b_total = baseline_results["total"]
k_total = kodit_results["total"]
b_res = baseline_results["resolved"]
k_res = kodit_results["resolved"]
b_rate = b_res / b_total if b_total > 0 else 0.0
k_rate = k_res / k_total if k_total > 0 else 0.0
_print_metric_row("Instances evaluated", str(b_total), str(k_total))
_print_metric_row("Resolved (passed)", str(b_res), str(k_res))
_print_metric_row("Resolve rate", f"{b_rate:.1%}", f"{k_rate:.1%}")
# Instance breakdown
_print_section("INSTANCE BREAKDOWN")
click.echo(f"Both resolved: {len(both):>5}")
click.echo(f"Baseline only resolved: {len(baseline_only):>5}")
click.echo(f"Kodit only resolved: {len(kodit_only):>5}")
click.echo(f"Neither resolved: {len(neither):>5}")
# Cost section
_print_section("COST & TOKEN USAGE")
_print_metric_row("Metric", "Baseline", "Kodit")
click.echo("-" * 70)
b_cost = baseline_stats["total_cost"]
k_cost = kodit_stats["total_cost"]
_print_metric_row("Total cost", f"${b_cost:.4f}", f"${k_cost:.4f}")
_print_metric_row(
"Total API calls",
f"{baseline_stats['total_api_calls']:,}",
f"{kodit_stats['total_api_calls']:,}",
)
_print_metric_row(
"Tokens sent",
f"{baseline_stats['total_tokens_sent']:,}",
f"{kodit_stats['total_tokens_sent']:,}",
)
_print_metric_row(
"Tokens received",
f"{baseline_stats['total_tokens_received']:,}",
f"{kodit_stats['total_tokens_received']:,}",
)
b_sent = baseline_stats["total_tokens_sent"]
b_recv = baseline_stats["total_tokens_received"]
k_sent = kodit_stats["total_tokens_sent"]
k_recv = kodit_stats["total_tokens_received"]
b_tokens = b_sent + b_recv
k_tokens = k_sent + k_recv
_print_metric_row("Total tokens", f"{b_tokens:,}", f"{k_tokens:,}")
# Summary
_print_section("SUMMARY", "=")
_print_summary(baseline_only, kodit_only, b_res, k_res)
click.echo(f"\nDetailed results saved to: {output}")
click.echo("=" * 70)
# Resolution differences
_print_resolution_differences(baseline_only, kodit_only)
def _print_summary(
baseline_only: set, kodit_only: set, b_resolved: int, k_resolved: int
) -> None:
"""Print the summary section."""
improvement = len(kodit_only) - len(baseline_only)
if improvement > 0:
click.echo(f"Kodit resolved {improvement} more instance(s) than baseline")
elif improvement < 0:
click.echo(f"Baseline resolved {-improvement} more instance(s) than Kodit")
else:
click.echo("Both approaches resolved the same number of unique instances")
if k_resolved > b_resolved:
diff = k_resolved - b_resolved
click.echo(f"Overall: Kodit has {diff} more total resolutions")
elif b_resolved > k_resolved:
diff = b_resolved - k_resolved
click.echo(f"Overall: Baseline has {diff} more total resolutions")
def _print_resolution_differences(baseline_only: set, kodit_only: set) -> None:
"""Print the resolution differences section."""
if not baseline_only and not kodit_only:
return
_print_section("RESOLUTION DIFFERENCES")
if baseline_only:
click.echo("\nBaseline resolved but Kodit did not:")
for instance_id in sorted(baseline_only):
click.echo(f" - {instance_id}")
if kodit_only:
click.echo("\nKodit resolved but Baseline did not:")
for instance_id in sorted(kodit_only):
click.echo(f" + {instance_id}")
# Server defaults
DEFAULT_HOST = "127.0.0.1"
DEFAULT_PORT = 8765
DEFAULT_DB_PORT = 5432
# Enrichment defaults
# Note: When using openrouter/* prefixed models, do NOT set base_url.
# LiteLLM automatically routes to OpenRouter based on the model prefix.
DEFAULT_ENRICHMENT_BASE_URL = ""
DEFAULT_ENRICHMENT_PARALLEL_TASKS = 25
DEFAULT_ENRICHMENT_TIMEOUT = 60
DEFAULT_EMBEDDING_BASE_URL = ""
DEFAULT_EMBEDDING_PARALLEL_TASKS = 25
DEFAULT_EMBEDDING_TIMEOUT = 60
# Model defaults
DEFAULT_EMBEDDING_MODEL = "openrouter/mistralai/codestral-embed-2505"
DEFAULT_SWE_AGENT_MODEL = "openrouter/anthropic/claude-haiku-4.5"
DEFAULT_KODIT_ENRICHMENT_MODEL = "openrouter/mistralai/ministral-8b-2512"
class MissingApiKeyError(click.ClickException):
"""Raised when ENRICHMENT_ENDPOINT_API_KEY is not set."""
message = (
"ENRICHMENT_ENDPOINT_API_KEY environment variable is required.\n"
"Set it with: export ENRICHMENT_ENDPOINT_API_KEY=your-api-key"
)
def __init__(self) -> None:
"""Initialize with the error message."""
super().__init__(self.message)
class MissingEmbeddingApiKeyError(click.ClickException):
"""Raised when EMBEDDING_ENDPOINT_API_KEY is not set."""
message = (
"EMBEDDING_ENDPOINT_API_KEY environment variable is required.\n"
"Set it with: export EMBEDDING_ENDPOINT_API_KEY=your-api-key"
)
def __init__(self) -> None:
"""Initialize with the error message."""
super().__init__(self.message)
def require_api_key(api_key: str | None) -> str:
"""Validate that API key is provided, raising an error if not."""
if not api_key:
raise MissingApiKeyError
return api_key
def require_embedding_api_key(api_key: str | None) -> str:
"""Validate that embedding API key is provided, raising an error if not."""
if not api_key:
raise MissingEmbeddingApiKeyError
return api_key
@click.group(context_settings={"max_content_width": 100})
def cli() -> None:
"""kodit-benchmark CLI - Benchmark Kodit's retrieval capabilities."""
configure_logging(AppContext())
@cli.command("download")
@click.option(
"--dataset",
type=click.Choice(["lite", "verified"]),
default="lite",
help="SWE-bench dataset variant",
)
@click.option(
"--output",
type=click.Path(path_type=Path),
default=None,
help="Output JSON file path (default: benchmarks/data/swebench-{variant}.json)",
)
def download(dataset: str, output: Path | None) -> None:
"""Download SWE-bench dataset from HuggingFace and save as JSON."""
log = structlog.get_logger(__name__)
if output is None:
output = DEFAULT_OUTPUT_DIR / f"swebench-{dataset}.json"
log.info("Downloading SWE-bench dataset", variant=dataset, output=str(output))
loader = DatasetLoader()
instances = loader.download(dataset)
loader.save(instances, output)
log.info("Download complete", count=len(instances), output=str(output))
# ============================================================================
# Mini-swe-agent commands
# ============================================================================
MINI_SWE_AGENT_OUTPUT_DIR = Path("benchmarks/minisweagent")
MINI_SWE_AGENT_CONFIG_DIR = Path(__file__).parent / "minisweagent" / "configs"
@cli.group("mini-swe-agent")
def mini_swe_agent_group() -> None:
"""Mini-swe-agent benchmark commands for Kodit comparison."""
@mini_swe_agent_group.command("run-baseline")
@click.option(
"--dataset-file",
type=click.Path(path_type=Path, exists=True),
default=DEFAULT_OUTPUT_DIR / "swebench-verified.json",
help="Path to SWE-bench dataset JSON file",
)
@click.option(
"--output-dir",
type=click.Path(path_type=Path),
default=MINI_SWE_AGENT_OUTPUT_DIR,
help="Output directory for predictions and trajectories",
)
@click.option(
"--workers",
default=1,
type=int,
help="Number of parallel workers",
)
@click.option(
"--limit",
default=None,
type=int,
help="Limit number of instances to run (for testing)",
)
@click.option(
"--instance-id",
default=None,
type=str,
help="Run only a specific instance by ID",
)
@click.option(
"--api-key",
envvar="ENRICHMENT_ENDPOINT_API_KEY",
help="LLM API key (or set ENRICHMENT_ENDPOINT_API_KEY)",
)
@click.option(
"--swe-agent-model",
default=DEFAULT_SWE_AGENT_MODEL,
help="LiteLLM model identifier for mini-swe-agent",
)
@click.option(
"--evaluate/--no-evaluate",
default=True,
help="Run SWE-bench evaluation after completion",
)
@click.option(
"--stream/--no-stream",
default=True,
help="Stream mini-swe-agent output to terminal instead of capturing",
)
def mini_run_baseline( # noqa: PLR0913, PLR0915, C901
dataset_file: Path,
output_dir: Path,
workers: int,
limit: int | None,
instance_id: str | None,
api_key: str | None,
swe_agent_model: str,
evaluate: bool, # noqa: FBT001
stream: bool, # noqa: FBT001
) -> None:
"""Run mini-swe-agent baseline (without Kodit retrieval).
This runs mini-swe-agent against SWE-bench instances with only the
problem statement, providing a baseline for comparison.
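
    Example (hypothetical invocation; assumes ENRICHMENT_ENDPOINT_API_KEY is
    exported and the installed entry-point name matches the CLI group):

        kodit-benchmark mini-swe-agent run-baseline --limit 5 --workers 2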
"""
api_key = require_api_key(api_key)
log = structlog.get_logger(__name__)
# Load instances
loader = DatasetLoader()
instances = loader.load(dataset_file)
if instance_id:
instances = [i for i in instances if i.instance_id == instance_id]
if not instances:
click.echo(f"Instance not found: {instance_id}", err=True)
raise SystemExit(1)
log.info("Running single instance", instance_id=instance_id)
elif limit:
instances = instances[:limit]
log.info("Limited instances", limit=limit)
log.info(
"Running mini-swe-agent baseline",
instance_count=len(instances),
workers=workers,
)
# Create runner and config
runner = MiniSweAgentRunner()
config = RunConfig(
config_path=MINI_SWE_AGENT_CONFIG_DIR / "baseline.yaml",
output_dir=output_dir,
model=swe_agent_model,
workers=workers,
api_key=api_key,
stream_output=stream,
)
result = runner.run_baseline(config, instances)
click.echo("\n" + "=" * 60)
click.echo("MINI-SWE-AGENT BASELINE COMPLETE")
click.echo("=" * 60)
click.echo(f"Total instances: {result.total_instances}")
click.echo(f"Completed: {result.completed_instances}")
click.echo(f"With patches: {result.instances_with_patch}")
click.echo("-" * 60)
click.echo(f"Total cost: ${result.total_cost:.4f}")
click.echo(f"Total API calls: {result.total_api_calls}")
click.echo("-" * 60)
# Show per-instance stats
if result.instance_stats:
click.echo("\nPer-instance results:")
for stat in result.instance_stats:
patch_indicator = "✓" if stat.has_patch else "✗"
click.echo(
f" {patch_indicator} {stat.instance_id}: "
f"{stat.exit_status} (${stat.cost:.4f}, {stat.api_calls} calls)"
)
click.echo("-" * 60)
click.echo(f"Predictions: {result.predictions_path}")
click.echo("=" * 60)
# Run evaluation if requested
if evaluate and result.instances_with_patch > 0:
click.echo("\n" + "=" * 60)
click.echo("RUNNING SWE-BENCH EVALUATION")
click.echo("=" * 60)
# Convert predictions to JSONL format for SWE-bench
jsonl_path = runner.convert_preds_to_jsonl(result.predictions_path)
log.info("Converted predictions for evaluation", jsonl_path=str(jsonl_path))
# Run evaluation
evaluator = Evaluator()
try:
eval_result = evaluator.evaluate_full(
predictions_path=jsonl_path,
dataset_name="princeton-nlp/SWE-bench_Verified",
max_workers=workers,
run_id="mini_swe_agent_baseline",
)
click.echo("\n" + "-" * 60)
click.echo("EVALUATION RESULTS")
click.echo("-" * 60)
click.echo(f"Total predictions: {eval_result.total_predictions}")
click.echo(f"Resolved: {eval_result.resolved}")
click.echo(f"Resolve rate: {eval_result.resolve_rate:.1%}")
# Show per-instance evaluation results
if eval_result.instance_results:
click.echo("\nPer-instance evaluation:")
for ir in eval_result.instance_results:
status_indicator = "✓" if ir.status == "resolved" else "✗"
click.echo(f" {status_indicator} {ir.instance_id}: {ir.status}")
click.echo("=" * 60)
except EvaluationError as e:
log.exception("Evaluation failed", error=str(e))
click.echo(f"Evaluation failed: {e}", err=True)
elif evaluate and result.instances_with_patch == 0:
click.echo("\nSkipping evaluation: no instances produced patches", err=True)
@mini_swe_agent_group.command("run-kodit")
@click.option(
"--dataset-file",
type=click.Path(path_type=Path, exists=True),
default=DEFAULT_OUTPUT_DIR / "swebench-verified.json",
help="Path to SWE-bench dataset JSON file",
)
@click.option(
"--output-dir",
type=click.Path(path_type=Path),
default=MINI_SWE_AGENT_OUTPUT_DIR,
help="Output directory for predictions and trajectories",
)
@click.option(
"--repos-dir",
type=click.Path(path_type=Path),
default=DEFAULT_REPOS_DIR,
help="Directory to clone repositories into",
)
@click.option(
"--workers",
default=1,
type=int,
help="Number of parallel workers",
)
@click.option(
"--top-k",
default=10,
type=int,
help="Number of snippets to retrieve per instance",
)
@click.option("--host", default=DEFAULT_HOST, help="Kodit server host")
@click.option("--port", default=DEFAULT_PORT, type=int, help="Kodit server port")
@click.option("--db-port", default=DEFAULT_DB_PORT, type=int, help="Database port")
@click.option(
"--enrichment-base-url",
default=DEFAULT_ENRICHMENT_BASE_URL,
help="Enrichment endpoint base URL",
)
@click.option(
"--kodit-enrichment-model",
default=DEFAULT_KODIT_ENRICHMENT_MODEL,
help="Enrichment model name",
)
@click.option(
"--enrichment-parallel-tasks",
default=DEFAULT_ENRICHMENT_PARALLEL_TASKS,
type=int,
help="Number of parallel enrichment tasks",
)
@click.option(
"--enrichment-timeout",
default=DEFAULT_ENRICHMENT_TIMEOUT,
type=int,
help="Enrichment request timeout in seconds",
)
@click.option(
"--embedding-base-url",
default=DEFAULT_EMBEDDING_BASE_URL,
help="Embedding endpoint base URL",
)
@click.option(
"--embedding-model",
default=DEFAULT_EMBEDDING_MODEL,
help="Embedding model name",
)
@click.option(
"--embedding-api-key",
envvar="EMBEDDING_ENDPOINT_API_KEY",
help="Embedding API key (or set EMBEDDING_ENDPOINT_API_KEY)",
)
@click.option(
"--embedding-parallel-tasks",
default=DEFAULT_EMBEDDING_PARALLEL_TASKS,
type=int,
help="Number of parallel embedding tasks",
)
@click.option(
"--embedding-timeout",
default=DEFAULT_EMBEDDING_TIMEOUT,
type=int,
help="Embedding request timeout in seconds",
)
@click.option(
"--limit",
default=None,
type=int,
help="Limit number of instances to run (for testing)",
)
@click.option(
"--instance-id",
default=None,
type=str,
help="Run only a specific instance by ID",
)
@click.option(
"--api-key",
envvar="ENRICHMENT_ENDPOINT_API_KEY",
help="LLM API key (or set ENRICHMENT_ENDPOINT_API_KEY)",
)
@click.option(
"--swe-agent-model",
default=DEFAULT_SWE_AGENT_MODEL,
help="LiteLLM model identifier for mini-swe-agent",
)
@click.option(
"--stream/--no-stream",
default=True,
help="Stream mini-swe-agent output to terminal instead of capturing",
)
@click.option(
"--evaluate/--no-evaluate",
default=True,
help="Run SWE-bench evaluation after completion",
)
@click.option(
"--force-reindex",
is_flag=True,
help="Force re-indexing even if cached augmented instances exist",
)
def mini_run_kodit( # noqa: PLR0913, PLR0915, C901
dataset_file: Path,
output_dir: Path,
repos_dir: Path,
workers: int,
top_k: int,
host: str,
port: int,
db_port: int,
enrichment_base_url: str,
kodit_enrichment_model: str,
enrichment_parallel_tasks: int,
enrichment_timeout: int,
embedding_base_url: str,
embedding_model: str,
embedding_api_key: str | None,
embedding_parallel_tasks: int,
embedding_timeout: int,
limit: int | None,
instance_id: str | None,
api_key: str | None,
swe_agent_model: str,
stream: bool, # noqa: FBT001
evaluate: bool, # noqa: FBT001
force_reindex: bool, # noqa: FBT001
) -> None:
"""Run mini-swe-agent with Kodit retrieval.
This runs mini-swe-agent against SWE-bench instances with problem
statements augmented with Kodit-retrieved code context.
If augmented instances have been cached from a previous run, the indexing
and retrieval steps are skipped. Use --force-reindex to regenerate.
For each instance, this command:
1. Starts the Kodit server and database
2. Clones the repository at the exact commit
3. Indexes it with Kodit and waits for completion
4. Retrieves relevant code snippets
5. Augments the problem statement with the context
6. Runs mini-swe-agent with the augmented problem statement
7. Stops the Kodit server
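
    Example (hypothetical invocation; assumes ENRICHMENT_ENDPOINT_API_KEY and
    EMBEDDING_ENDPOINT_API_KEY are exported and the installed entry-point name
    matches the CLI group):

        kodit-benchmark mini-swe-agent run-kodit --limit 5 --top-k 10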
"""
api_key = require_api_key(api_key)
embedding_api_key = require_embedding_api_key(embedding_api_key)
log = structlog.get_logger(__name__)
# Load instances
loader = DatasetLoader()
instances = loader.load(dataset_file)
if instance_id:
instances = [i for i in instances if i.instance_id == instance_id]
if not instances:
click.echo(f"Instance not found: {instance_id}", err=True)
raise SystemExit(1)
log.info("Running single instance", instance_id=instance_id)
elif limit:
instances = instances[:limit]
log.info("Limited instances", limit=limit)
log.info(
"Running mini-swe-agent with Kodit",
instance_count=len(instances),
workers=workers,
top_k=top_k,
repos_dir=str(repos_dir),
)
# Helper to create a fresh server for each instance
def create_server() -> ServerProcess:
return ServerProcess(
host=host,
port=port,
db_port=db_port,
enrichment_base_url=enrichment_base_url,
enrichment_model=kodit_enrichment_model,
enrichment_api_key=api_key,
enrichment_parallel_tasks=enrichment_parallel_tasks,
enrichment_timeout=enrichment_timeout,
embedding_base_url=embedding_base_url,
embedding_model=embedding_model,
embedding_api_key=embedding_api_key,
embedding_parallel_tasks=embedding_parallel_tasks,
embedding_timeout=embedding_timeout,
)
base_url = f"http://{host}:{port}"
# Create runner and config
runner = MiniSweAgentRunner(kodit_base_url=base_url, top_k=top_k)
config = RunConfig(
config_path=MINI_SWE_AGENT_CONFIG_DIR / "kodit.yaml",
output_dir=output_dir,
model=swe_agent_model,
repos_dir=repos_dir,
workers=workers,
api_key=api_key,
stream_output=stream,
force_reindex=force_reindex,
)
# Process each instance with fresh server start/stop
result = runner.run_with_kodit_per_instance(
config=config,
instances=instances,
server_factory=create_server,
)
click.echo("\n" + "=" * 60)
click.echo("MINI-SWE-AGENT WITH KODIT COMPLETE")
click.echo("=" * 60)
click.echo(f"Total instances: {result.total_instances}")
click.echo(f"Completed: {result.completed_instances}")
click.echo(f"With patches: {result.instances_with_patch}")
click.echo("-" * 60)
click.echo(f"Total cost: ${result.total_cost:.4f}")
click.echo(f"Total API calls: {result.total_api_calls}")
click.echo("-" * 60)
# Show per-instance stats
if result.instance_stats:
click.echo("\nPer-instance results:")
for stat in result.instance_stats:
patch_indicator = "✓" if stat.has_patch else "✗"
click.echo(
f" {patch_indicator} {stat.instance_id}: "
f"{stat.exit_status} (${stat.cost:.4f}, {stat.api_calls} calls)"
)
click.echo("-" * 60)
click.echo(f"Predictions: {result.predictions_path}")
click.echo("=" * 60)
# Run evaluation if requested
if evaluate and result.instances_with_patch > 0:
click.echo("\n" + "=" * 60)
click.echo("RUNNING SWE-BENCH EVALUATION")
click.echo("=" * 60)
# Convert predictions to JSONL format for SWE-bench
jsonl_path = runner.convert_preds_to_jsonl(result.predictions_path)
log.info("Converted predictions for evaluation", jsonl_path=str(jsonl_path))
# Run evaluation
evaluator = Evaluator()
try:
eval_result = evaluator.evaluate_full(
predictions_path=jsonl_path,
dataset_name="princeton-nlp/SWE-bench_Verified",
max_workers=workers,
run_id="mini_swe_agent_kodit",
)
click.echo("\n" + "-" * 60)
click.echo("EVALUATION RESULTS")
click.echo("-" * 60)
click.echo(f"Total predictions: {eval_result.total_predictions}")
click.echo(f"Resolved: {eval_result.resolved}")
click.echo(f"Resolve rate: {eval_result.resolve_rate:.1%}")
# Show per-instance evaluation results
if eval_result.instance_results:
click.echo("\nPer-instance evaluation:")
for ir in eval_result.instance_results:
status_indicator = "✓" if ir.status == "resolved" else "✗"
click.echo(f" {status_indicator} {ir.instance_id}: {ir.status}")
click.echo("=" * 60)
except EvaluationError as e:
log.exception("Evaluation failed", error=str(e))
click.echo(f"Evaluation failed: {e}", err=True)
elif evaluate and result.instances_with_patch == 0:
click.echo("\nSkipping evaluation: no instances produced patches", err=True)
@mini_swe_agent_group.command("compare")
@click.option(
"--baseline-dir",
type=click.Path(path_type=Path, exists=True),
default=MINI_SWE_AGENT_OUTPUT_DIR / "baseline",
help="Path to baseline output directory",
)
@click.option(
"--kodit-dir",
type=click.Path(path_type=Path, exists=True),
default=MINI_SWE_AGENT_OUTPUT_DIR / "kodit",
help="Path to Kodit output directory",
)
@click.option(
"--baseline-eval",
type=click.Path(path_type=Path, exists=True),
default=None,
help="Path to baseline evaluation JSON (auto-detected if not specified)",
)
@click.option(
"--kodit-eval",
type=click.Path(path_type=Path, exists=True),
default=None,
help="Path to Kodit evaluation JSON (auto-detected if not specified)",
)
@click.option(
"--output",
type=click.Path(path_type=Path),
default=MINI_SWE_AGENT_OUTPUT_DIR / "comparison.json",
help="Output JSON file for comparison results",
)
def mini_compare(
baseline_dir: Path,
kodit_dir: Path,
baseline_eval: Path | None,
kodit_eval: Path | None,
output: Path,
) -> None:
"""Compare baseline and Kodit mini-swe-agent results.
Compares pass/fail rates, total costs, and token usage between
the baseline and Kodit-augmented approaches.
Requires evaluation results to have been generated (run with --evaluate).
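
    Example (hypothetical invocation; assumes baseline/ and kodit/ runs already
    exist under benchmarks/minisweagent and the installed entry-point name
    matches the CLI group):

        kodit-benchmark mini-swe-agent compare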
"""
log = structlog.get_logger(__name__)
log.info(
"Comparing results",
baseline_dir=str(baseline_dir),
kodit_dir=str(kodit_dir),
)
# Extract stats from both directories
baseline_stats = _extract_run_stats(baseline_dir)
kodit_stats = _extract_run_stats(kodit_dir)
# Load evaluation results
baseline_results = _load_evaluation_results(
baseline_eval, baseline_dir, "mini_swe_agent_baseline"
)
kodit_results = _load_evaluation_results(
kodit_eval, kodit_dir, "mini_swe_agent_kodit"
)
# Compute comparison metrics
baseline_resolved = baseline_results["resolved_ids"]
kodit_resolved = kodit_results["resolved_ids"]
both_resolved = baseline_resolved & kodit_resolved
baseline_only = baseline_resolved - kodit_resolved
kodit_only = kodit_resolved - baseline_resolved
all_instances = (
baseline_results["resolved_ids"]
| baseline_results["unresolved_ids"]
| baseline_results["error_ids"]
| baseline_results["empty_patch_ids"]
| kodit_results["resolved_ids"]
| kodit_results["unresolved_ids"]
| kodit_results["error_ids"]
| kodit_results["empty_patch_ids"]
)
neither = all_instances - baseline_resolved - kodit_resolved
instance_sets = {
"both": both_resolved,
"baseline_only": baseline_only,
"kodit_only": kodit_only,
"neither": neither,
}
# Build comparison data for JSON output
comparison = _build_comparison_dict(
baseline_stats, kodit_stats,
baseline_results, kodit_results,
instance_sets,
)
# Write comparison JSON
output.parent.mkdir(parents=True, exist_ok=True)
with output.open("w") as f:
json.dump(comparison, f, indent=2)
# Display formatted report
_display_comparison_report(
baseline_stats, kodit_stats,
baseline_results, kodit_results,
output,
)
def _build_run_summary(stats: dict, results: dict) -> dict:
"""Build summary dict for a single run."""
total = results["total"]
return {
"instances_evaluated": total,
"resolved": results["resolved"],
"resolve_rate": results["resolved"] / total if total > 0 else 0.0,
"total_cost": stats["total_cost"],
"total_api_calls": stats["total_api_calls"],
"total_tokens_sent": stats["total_tokens_sent"],
"total_tokens_received": stats["total_tokens_received"],
"total_tokens": stats["total_tokens_sent"] + stats["total_tokens_received"],
}
def _build_comparison_dict(
baseline_stats: dict,
kodit_stats: dict,
baseline_results: dict,
kodit_results: dict,
instance_sets: dict,
) -> dict:
"""Build comparison dictionary for JSON output."""
both = instance_sets["both"]
baseline_only = instance_sets["baseline_only"]
kodit_only = instance_sets["kodit_only"]
neither = instance_sets["neither"]
return {
"summary": {
"baseline": _build_run_summary(baseline_stats, baseline_results),
"kodit": _build_run_summary(kodit_stats, kodit_results),
"comparison": {
"both_resolved": len(both),
"baseline_only_resolved": len(baseline_only),
"kodit_only_resolved": len(kodit_only),
"neither_resolved": len(neither),
"kodit_improvement": len(kodit_only) - len(baseline_only),
},
},
"instances": {
"both_resolved": sorted(both),
"baseline_only_resolved": sorted(baseline_only),
"kodit_only_resolved": sorted(kodit_only),
"neither_resolved": sorted(neither),
},
}
if __name__ == "__main__":
cli()