compare_models
Compare 2-5 LLM/VLM models side-by-side for pricing, benchmarks, and capabilities. Returns a compact Markdown comparison table to help select the right model.
Instructions
Compare 2-5 LLM/VLM models side-by-side: pricing, benchmarks, capabilities. Returns a compact Markdown comparison table (~400 tokens).
Input Schema
Table / JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| models | Yes | Model IDs or partial names (e.g., ["claude-sonnet-4.6", "gpt-5.2", "gemini-3-pro"]) | — |
Implementation Reference
- src/tools/compare.ts:24-64 (handler): The main handler function for the compare_models tool. Resolves model queries from the registry, validates that at least 2 models were found, and returns a formatted Markdown comparison table. Returns an error if fewer than 2 models can be resolved.
// Handler for compare_models: resolves each query, requires >= 2 matches,
// and renders a Markdown comparison table for the resolved models.
async ({ models: modelQueries }) => {
  await registry.ensureLoaded();

  // Partition queries into resolved models and misses.
  const resolved: UnifiedModel[] = [];
  const notFound: string[] = [];
  for (const query of modelQueries) {
    const found = registry.getModel(query);
    if (found) {
      resolved.push(found);
    } else {
      notFound.push(query);
    }
  }

  // A comparison needs at least two models; otherwise suggest near-misses.
  if (resolved.length < 2) {
    // FIX: dedupe suggestions — several failed queries can map to the
    // same similar model, which previously produced repeated entries.
    const similar = [
      ...new Set(notFound.flatMap((q) => registry.findSimilar(q))),
    ].slice(0, 5);
    return {
      content: [
        {
          type: "text" as const,
          text:
            `Need at least 2 models to compare. Not found: ${notFound.join(", ")}.` +
            (similar.length > 0 ? ` Did you mean: ${similar.join(", ")}?` : ""),
        },
      ],
      isError: true,
    };
  }

  // NOTE(review): getCacheFreshnessMs() is consumed as an epoch timestamp by
  // freshnessFooter (new Date(fetchedAt)) — confirm it returns a timestamp,
  // not an age in milliseconds, as the name suggests.
  const fetchedAt = registry.getCacheFreshnessMs();
  const output = formatComparison(resolved, notFound, fetchedAt);
  return {
    content: [{ type: "text" as const, text: output }],
  };
}
- src/tools/compare.ts:16-23 (schema): Zod schema for compare_models input: requires 'models' array of 2-5 strings (model IDs or partial names).
// Input schema: 2-5 model identifiers (exact IDs or partial names).
models: z
  .array(z.string())
  .min(2)
  .max(5)
  .describe(
    'Model IDs or partial names (e.g., ["claude-sonnet-4.6", "gpt-5.2", "gemini-3-pro"])'
  ),
},
- src/tools/compare.ts:7-66 (registration): registerCompareTool function that registers the 'compare_models' tool with the MCP server, including name, description, schema, and handler.
/**
 * Registers the `compare_models` tool on the MCP server.
 *
 * The tool accepts 2-5 model IDs or partial names, resolves them against the
 * registry, and returns a compact Markdown comparison table (pricing,
 * benchmarks, capabilities). If fewer than 2 queries resolve, it returns an
 * error message with "did you mean" suggestions.
 *
 * @param server - MCP server to register the tool on.
 * @param registry - Model registry used for lookup and fuzzy matching.
 */
export function registerCompareTool(
  server: McpServer,
  registry: ModelRegistry
): void {
  server.tool(
    "compare_models",
    "Compare 2-5 LLM/VLM models side-by-side: pricing, benchmarks, capabilities. " +
      "Returns a compact Markdown comparison table (~400 tokens).",
    {
      models: z
        .array(z.string())
        .min(2)
        .max(5)
        .describe(
          'Model IDs or partial names (e.g., ["claude-sonnet-4.6", "gpt-5.2", "gemini-3-pro"])'
        ),
    },
    async ({ models: modelQueries }) => {
      await registry.ensureLoaded();

      // Partition queries into resolved models and misses.
      const resolved: UnifiedModel[] = [];
      const notFound: string[] = [];
      for (const query of modelQueries) {
        const found = registry.getModel(query);
        if (found) {
          resolved.push(found);
        } else {
          notFound.push(query);
        }
      }

      // A comparison needs at least two models; otherwise suggest near-misses.
      if (resolved.length < 2) {
        // FIX: dedupe suggestions — several failed queries can map to the
        // same similar model, which previously produced repeated entries.
        const similar = [
          ...new Set(notFound.flatMap((q) => registry.findSimilar(q))),
        ].slice(0, 5);
        return {
          content: [
            {
              type: "text" as const,
              text:
                `Need at least 2 models to compare. Not found: ${notFound.join(", ")}.` +
                (similar.length > 0 ? ` Did you mean: ${similar.join(", ")}?` : ""),
            },
          ],
          isError: true,
        };
      }

      // NOTE(review): getCacheFreshnessMs() is consumed as an epoch timestamp
      // by freshnessFooter — confirm it returns a timestamp, not an age in ms.
      const fetchedAt = registry.getCacheFreshnessMs();
      const output = formatComparison(resolved, notFound, fetchedAt);
      return {
        content: [{ type: "text" as const, text: output }],
      };
    }
  );
}
- src/tools/compare.ts:68-173 (helper): Helper functions for compare_models: formatComparison builds the Markdown table, row creates table rows, highlightBest bolds the highest numeric values in benchmark comparisons.
/**
 * Builds the Markdown comparison table for the resolved models.
 *
 * Layout: title, optional "not found" note, header row of model IDs, then
 * pricing, context, benchmarks (best value bolded, rows with no data
 * omitted), capability flags, release dates, and a data-freshness footer.
 *
 * @param models - Models to compare (columns).
 * @param notFound - Queries that failed to resolve; shown as a note.
 * @param fetchedAt - Optional fetch timestamp for the freshness footer.
 */
function formatComparison(
  models: UnifiedModel[],
  notFound: string[],
  fetchedAt?: number
): string {
  const out: string[] = [`## Model Comparison (${models.length} models)`];
  if (notFound.length > 0) {
    out.push(`\n> Not found: ${notFound.join(", ")}`);
  }
  out.push("");

  // Header: empty feature cell followed by one bolded model ID per column.
  out.push(`| | ${models.map((m) => `**${m.id}**`).join(" | ")} |`);
  out.push(`|------|${models.map(() => "------").join("|")}|`);

  // Pricing.
  out.push(row("Input $/1M", models.map((m) => fmtPrice(m.pricing.input))));
  out.push(row("Output $/1M", models.map((m) => fmtPrice(m.pricing.output))));
  if (models.some((m) => m.pricing.cacheRead !== undefined)) {
    out.push(
      row("Cache Read $/1M", models.map((m) => fmtPrice(m.pricing.cacheRead)))
    );
  }

  // Context window and output limit.
  out.push(
    row("Context", models.map((m) => fmtContext(m.capabilities.contextLength)))
  );
  out.push(
    row(
      "Max Output",
      models.map((m) => fmtContext(m.capabilities.maxOutputTokens))
    )
  );

  // Benchmarks — a row is emitted only when at least one model has data,
  // and the best value in the row is bolded.
  const benchmarks: [string, (m: UnifiedModel) => string][] = [
    ["SWE-bench", (m) => fmtScore(m.benchmarks.sweBenchVerified)],
    ["Aider Polyglot", (m) => fmtScore(m.benchmarks.aiderPolyglot)],
    ["Arena Elo", (m) => fmtElo(m.benchmarks.arenaElo)],
    ["MMLU-Pro", (m) => fmtScore(m.benchmarks.mmluPro)],
    ["GPQA Diamond", (m) => fmtScore(m.benchmarks.gpqaDiamond)],
    ["MATH-500", (m) => fmtScore(m.benchmarks.math500)],
    ["MMMU", (m) => fmtScore(m.benchmarks.mmmu)],
  ];
  for (const [label, get] of benchmarks) {
    const cells = models.map(get);
    if (cells.some((c) => c !== "n/a")) {
      out.push(row(label, highlightBest(cells)));
    }
  }

  // Capability flags and metadata.
  out.push(
    row(
      "Vision",
      models.map((m) =>
        m.capabilities.inputModalities.includes("image") ? "Yes" : "No"
      )
    )
  );
  out.push(
    row("Tools", models.map((m) => (m.capabilities.supportsTools ? "Yes" : "No")))
  );
  out.push(
    row(
      "Reasoning",
      models.map((m) => (m.capabilities.supportsReasoning ? "Yes" : "No"))
    )
  );
  out.push(
    row(
      "Open Source",
      models.map((m) => (m.metadata.isOpenSource ? "Yes" : "No"))
    )
  );
  out.push(row("Released", models.map((m) => m.metadata.releaseDate ?? "n/a")));

  out.push(freshnessFooter(fetchedAt));
  return out.join("\n");
}

/** Renders one Markdown table row: `| label | v1 | v2 | ... |`. */
function row(label: string, values: string[]): string {
  return ["|", label, "|", values.join(" | "), "|"].join(" ");
}

/** Bolds the highest numeric value(s) in the array; "n/a" is never bolded. */
function highlightBest(values: string[]): string[] {
  // Strip units/symbols ("72.1%", "$3.00") before parsing; unparseable
  // cells score -Infinity so they can never win.
  const scores = values.map((v) => {
    const parsed = parseFloat(v.replace(/[^0-9.\-]/g, ""));
    return Number.isNaN(parsed) ? -Infinity : parsed;
  });
  const best = Math.max(...scores);
  if (best === -Infinity) return values;
  return values.map((v, i) =>
    scores[i] === best && v !== "n/a" ? `**${v}**` : v
  );
}
- src/tools/formatters.ts:23-62 (helper): Formatter utilities used by compare_models: fmtPrice ($X.XX format), fmtContext (128K/1M format), fmtScore (percentage), fmtElo (integer), and freshnessFooter (data age timestamp).
/** Formats a per-million-token price: "$X.XX", "free" for 0, "n/a" when unknown. */
export function fmtPrice(price: number | undefined): string {
  if (price == null) return "n/a";
  if (price === 0) return "free";
  // Sub-cent prices get four decimals so they don't render as "$0.00".
  return price < 0.01 ? `$${price.toFixed(4)}` : `$${price.toFixed(2)}`;
}

/** Formats token counts compactly: 128000 → "128K", 1000000 → "1M". */
export function fmtContext(tokens: number | undefined): string {
  if (!tokens) return "n/a";
  if (tokens >= 1_000_000) {
    // Drop the decimal for exact millions ("1M"), keep one otherwise ("1.5M").
    const exact = tokens % 1_000_000 === 0;
    return `${(tokens / 1_000_000).toFixed(exact ? 0 : 1)}M`;
  }
  if (tokens >= 1_000) return `${Math.round(tokens / 1_000)}K`;
  return `${tokens}`;
}

/** Formats a benchmark score as a percentage: 72.1 → "72.1%", or "n/a". */
export function fmtScore(score: number | undefined): string {
  return score == null ? "n/a" : `${score.toFixed(1)}%`;
}

/** Formats an Elo rating as a rounded integer string, or "n/a". */
export function fmtElo(elo: number | undefined): string {
  return elo == null ? "n/a" : `${Math.round(elo)}`;
}

/** Joins modalities with "+": ["text", "image"] → "text+image"; empty → "text". */
export function fmtModalities(mods: string[]): string {
  const joined = mods.join("+");
  return joined.length > 0 ? joined : "text";
}

/**
 * Builds the data-freshness footer line, or "" when no timestamp is known.
 * `fetchedAt` is an epoch-milliseconds timestamp of the last data fetch.
 */
export function freshnessFooter(fetchedAt?: number): string {
  if (!fetchedAt) return "";
  // ISO timestamp with milliseconds trimmed: "2024-01-01T00:00:00Z".
  const iso = new Date(fetchedAt).toISOString().replace(/\.\d{3}Z$/, "Z");
  const minutesAgo = Math.round((Date.now() - fetchedAt) / 60_000);
  return `\n**Data freshness**: ${iso} (${minutesAgo}min ago)`;
}