Skip to main content
Glama
duck-judge.ts (9.51 kB)
import { ProviderManager } from '../providers/manager.js';
import { DuckResponse, JudgeEvaluation } from '../config/types.js';
import { logger } from '../utils/logger.js';

/** Arguments accepted by {@link duckJudgeTool}. */
export interface DuckJudgeArgs {
  /** Responses to compare; at least two are required. */
  responses: DuckResponse[];
  /** Name of the provider that acts as judge; defaults to the first known provider. */
  judge?: string;
  /** Criteria listed in the prompt; defaults to accuracy, completeness, clarity. */
  criteria?: string[];
  /** Optional persona prepended to the judge prompt. */
  persona?: string;
}

/** JSON shape the judge is asked to reply with (see the prompt built below). */
interface ParsedJudgment {
  rankings: Array<{
    provider: string;
    score: number;
    justification: string;
  }>;
  criteria_scores?: Record<string, Record<string, number>>;
  summary: string;
}

const DEFAULT_CRITERIA = ['accuracy', 'completeness', 'clarity'];

/**
 * Asks one provider to judge a set of responses and returns a formatted report.
 *
 * @param providerManager - Used to list provider names and to send the prompt
 *   to the judge via `askDuck`.
 * @param args - Untyped tool arguments, read as {@link DuckJudgeArgs}.
 * @returns A tool result holding a single text item with the rendered evaluation.
 * @throws Error when fewer than two responses are supplied or when no judge
 *   provider can be determined. Rejections from `askDuck` are not caught here.
 */
export async function duckJudgeTool(
  providerManager: ProviderManager,
  args: Record<string, unknown>
) {
  const {
    responses,
    judge,
    criteria = DEFAULT_CRITERIA,
    persona,
  } = args as unknown as DuckJudgeArgs;

  // Validate inputs
  if (!responses || !Array.isArray(responses) || responses.length === 0) {
    throw new Error('At least one response is required to judge');
  }

  if (responses.length === 1) {
    throw new Error('At least two responses are required for comparison');
  }

  // Determine judge provider: explicit choice wins, otherwise the first known provider
  const judgeProvider = judge || providerManager.getProviderNames()[0];
  if (!judgeProvider) {
    throw new Error('No judge provider available');
  }

  logger.info(`Starting judgment with ${judgeProvider} on ${responses.length} responses`);

  // Build the judgment prompt
  const prompt = buildJudgePrompt(responses, criteria, persona);

  // Get judgment from the judge duck
  const judgeResponse = await providerManager.askDuck(judgeProvider, prompt);

  // Parse the judgment
  const evaluation = parseJudgment(
    judgeResponse.content,
    judgeResponse.provider,
    judgeResponse.nickname,
    responses,
    criteria
  );

  // Format output
  const formattedOutput = formatJudgeResult(evaluation);

  logger.info(
    `Judgment completed by ${judgeProvider}: #1 is ${evaluation.rankings[0]?.provider || 'unknown'}`
  );

  return {
    content: [
      {
        type: 'text',
        text: formattedOutput,
      },
    ],
  };
}

/**
 * Builds the prompt sent to the judge: the responses to compare, the numbered
 * criteria, and instructions to reply with a single JSON object.
 *
 * @param responses - Responses under evaluation, labelled by nickname and provider.
 * @param criteria - Criteria names, rendered as a numbered list and as JSON keys.
 * @param persona - When set, a persona sentence is prepended to the prompt.
 * @returns The complete prompt text.
 */
function buildJudgePrompt(
  responses: DuckResponse[],
  criteria: string[],
  persona?: string
): string {
  const criteriaList = criteria.map((c, i) => `${i + 1}. ${c}`).join('\n');

  const responsesText = responses
    .map((r, i) => `--- Response ${i + 1} (${r.nickname} / ${r.provider}) ---\n${r.content}\n`)
    .join('\n');

  const personaText = persona
    ? `You are a ${persona} evaluating these responses.\n\n`
    : '';

  return `${personaText}You are a judge evaluating ${responses.length} responses to the same prompt.

RESPONSES TO EVALUATE:
${responsesText}

EVALUATION CRITERIA:
${criteriaList}

INSTRUCTIONS:
1. Evaluate each response against ALL criteria
2. Assign a score from 0-100 for each response
3. Rank responses from best to worst
4. Provide a brief justification for each ranking
5. Give a final summary

Respond with ONLY a JSON object in this exact format:
{
  "rankings": [
    {"provider": "<provider name>", "score": <0-100>, "justification": "<brief explanation>"},
    {"provider": "<provider name>", "score": <0-100>, "justification": "<brief explanation>"}
  ],
  "criteria_scores": {
    "<provider>": {${criteria.map(c => `"${c}": <0-100>`).join(', ')}}
  },
  "summary": "<overall assessment and recommendation>"
}

IMPORTANT:
- Rankings must be ordered from highest score to lowest
- Use the exact provider names from the responses
- Do NOT include any text before or after the JSON
- Do NOT use markdown code blocks`;
}

/** True for non-null, non-array objects, i.e. values safe to pass to `Object.entries`. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return value !== null && typeof value === 'object' && !Array.isArray(value);
}

/**
 * Maps a name written by the judge back to one of the original responses.
 *
 * Matching is case-insensitive and tried from most to least specific:
 * exact provider name, exact nickname, then provider or nickname appearing
 * anywhere inside the judge's text.
 *
 * @param judgeProviderName - Name taken from the judge's JSON.
 * @param originalResponses - Responses that were submitted for judging.
 * @returns The matching response, or `undefined` when nothing matches.
 */
function matchProvider(
  judgeProviderName: string,
  originalResponses: DuckResponse[]
): DuckResponse | undefined {
  const nameLower = judgeProviderName.trim().toLowerCase();
  if (nameLower === '') return undefined;

  // Exact provider match first
  const exactProvider = originalResponses.find(r => r.provider.toLowerCase() === nameLower);
  if (exactProvider) return exactProvider;

  // Exact nickname must come before the substring check; otherwise a shorter
  // nickname contained in the name (e.g. "Duck" in "Duck Pro") would win.
  const exactNickname = originalResponses.find(r => r.nickname.toLowerCase() === nameLower);
  if (exactNickname) return exactNickname;

  // Last resort: the judge wrapped the name in extra text ("GPT Duck / openai").
  // Empty identifiers are skipped because every string includes ''.
  return originalResponses.find(r => {
    const provider = r.provider.toLowerCase();
    const nickname = r.nickname.toLowerCase();
    return (
      (provider !== '' && nameLower.includes(provider)) ||
      (nickname !== '' && nameLower.includes(nickname))
    );
  });
}

/**
 * Keeps only per-provider score objects from the judge's `criteria_scores`,
 * so later formatting never iterates `null`, arrays or primitives.
 * Non-numeric criterion values are stored as 0.
 */
function sanitizeCriteriaScores(raw: unknown): Record<string, Record<string, number>> {
  const clean: Record<string, Record<string, number>> = {};
  if (!isRecord(raw)) return clean;

  for (const [provider, scores] of Object.entries(raw)) {
    if (!isRecord(scores)) continue;
    const providerScores: Record<string, number> = {};
    for (const [criterion, score] of Object.entries(scores)) {
      providerScores[criterion] = typeof score === 'number' ? score : 0;
    }
    clean[provider] = providerScores;
  }
  return clean;
}

/**
 * Converts the judge's raw text into a {@link JudgeEvaluation}.
 *
 * The first `{` through the last `}` of the text is parsed as JSON. Ranking
 * entries that are malformed, unmatched, or repeat an already ranked provider
 * are skipped. Ranks are assigned consecutively in the order entries are
 * accepted. Responses the judge did not rank are appended with score 0.
 * If no JSON is found or parsing throws, a fallback evaluation is returned.
 *
 * @param response - Raw text returned by the judge.
 * @param judgeProvider - Provider name of the judge.
 * @param judgeNickname - Nickname of the judge.
 * @param originalResponses - Responses that were submitted for judging.
 * @param criteria - Criteria that were sent to the judge.
 * @returns The parsed, or fallback, evaluation.
 */
function parseJudgment(
  response: string,
  judgeProvider: string,
  judgeNickname: string,
  originalResponses: DuckResponse[],
  criteria: string[]
): JudgeEvaluation {
  const evaluation: JudgeEvaluation = {
    judge: judgeProvider,
    judgeNickname: judgeNickname,
    prompt: '', // Will be filled by caller if needed
    criteria,
    rankings: [],
    criteriaScores: {},
    summary: '',
    rawResponse: response,
  };

  try {
    // Try to extract JSON from the response (tolerates text around the object)
    const jsonMatch = response.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
      logger.warn(`No JSON found in judge response from ${judgeProvider}`);
      return createFallbackEvaluation(evaluation, originalResponses, response);
    }

    // The judge's reply is untrusted, so every field is narrowed before use.
    const parsed = JSON.parse(jsonMatch[0]) as Partial<Record<keyof ParsedJudgment, unknown>>;
    const matchedProviders = new Set<string>();

    // Parse rankings
    const rawRankings: unknown[] = Array.isArray(parsed.rankings) ? parsed.rankings : [];
    for (const entry of rawRankings) {
      if (!isRecord(entry) || typeof entry.provider !== 'string') continue;

      const matched = matchProvider(entry.provider, originalResponses);
      if (!matched || matchedProviders.has(matched.provider)) continue;

      matchedProviders.add(matched.provider);
      const { score, justification } = entry;
      evaluation.rankings.push({
        provider: matched.provider,
        nickname: matched.nickname,
        // Rank by accepted position, not by the judge's array index, so that
        // skipped entries leave no gaps and appended entries cannot collide.
        rank: evaluation.rankings.length + 1,
        score: typeof score === 'number' ? Math.max(0, Math.min(100, score)) : 0,
        justification: justification == null ? '' : String(justification),
      });
    }

    // Parse criteria scores
    evaluation.criteriaScores = sanitizeCriteriaScores(parsed.criteria_scores);

    // Parse summary
    if (parsed.summary) {
      evaluation.summary = String(parsed.summary);
    }
  } catch (error) {
    logger.warn(`Failed to parse JSON judgment from ${judgeProvider}:`, error);
    return createFallbackEvaluation(evaluation, originalResponses, response);
  }

  // Ensure all original responses are represented
  const rankedProviders = new Set(evaluation.rankings.map(r => r.provider));
  for (const resp of originalResponses) {
    if (!rankedProviders.has(resp.provider)) {
      evaluation.rankings.push({
        provider: resp.provider,
        nickname: resp.nickname,
        rank: evaluation.rankings.length + 1,
        score: 0,
        justification: 'Not evaluated by judge',
      });
    }
  }

  return evaluation;
}

/**
 * Fills `evaluation` with neutral placeholder rankings when the judge's reply
 * could not be parsed: every response keeps its input order and gets score 50.
 *
 * @param evaluation - Evaluation to fill in; it is mutated and returned.
 * @param originalResponses - Responses that were submitted for judging.
 * @param rawResponse - Unparsed judge text, stored on the evaluation.
 * @returns The same `evaluation` object.
 */
function createFallbackEvaluation(
  evaluation: JudgeEvaluation,
  originalResponses: DuckResponse[],
  rawResponse: string
): JudgeEvaluation {
  // Create a basic evaluation when parsing fails
  evaluation.rankings = originalResponses.map((r, index) => ({
    provider: r.provider,
    nickname: r.nickname,
    rank: index + 1,
    score: 50,
    justification: 'Unable to parse judge response',
  }));
  evaluation.summary = `Judge evaluation parsing failed. Raw response available for review.`;
  evaluation.rawResponse = rawResponse;
  return evaluation;
}

/**
 * Renders an evaluation as markdown-style text: header, ranked list with a
 * ten-segment score bar, optional per-criterion breakdown, optional summary,
 * and a footer with the number of evaluated responses.
 *
 * @param evaluation - Evaluation to render.
 * @returns The formatted report.
 */
function formatJudgeResult(evaluation: JudgeEvaluation): string {
  let output = `⚖️ **Judge Evaluation**\n`;
  output += `═══════════════════════════════════════\n\n`;
  output += `**Judge:** ${evaluation.judgeNickname} (${evaluation.judge})\n`;
  output += `**Criteria:** ${evaluation.criteria.join(', ')}\n\n`;

  // Rankings
  output += `**Rankings:**\n`;
  output += `─────────────────────────────────────\n`;

  for (const ranking of evaluation.rankings) {
    const medal =
      ranking.rank === 1 ? '🥇' : ranking.rank === 2 ? '🥈' : ranking.rank === 3 ? '🥉' : ' ';

    // Clamp the bar to 0..10 segments so an out-of-range or non-finite score
    // can never make String.prototype.repeat throw.
    const segments = Number.isFinite(ranking.score) ? Math.floor(ranking.score / 10) : 0;
    const filled = Math.max(0, Math.min(10, segments));
    const bar = '█'.repeat(filled);
    const emptyBar = '░'.repeat(10 - filled);

    output += `${medal} **#${ranking.rank} ${ranking.nickname}** (${ranking.provider})\n`;
    output += ` Score: ${bar}${emptyBar} ${ranking.score}/100\n`;
    output += ` 💭 "${ranking.justification}"\n\n`;
  }

  // Criteria breakdown if available
  if (Object.keys(evaluation.criteriaScores).length > 0) {
    output += `**Criteria Breakdown:**\n`;
    output += `─────────────────────────────────────\n`;

    for (const [provider, scores] of Object.entries(evaluation.criteriaScores)) {
      output += `📊 **${provider}:**\n`;
      // `?? {}` keeps a null/undefined score table from crashing Object.entries.
      for (const [criterion, score] of Object.entries(scores ?? {})) {
        const criterionScore = typeof score === 'number' ? score : 0;
        output += ` • ${criterion}: ${criterionScore}/100\n`;
      }
      output += `\n`;
    }
  }

  // Summary
  if (evaluation.summary) {
    output += `**Summary:**\n`;
    output += `─────────────────────────────────────\n`;
    output += `${evaluation.summary}\n\n`;
  }

  output += `═══════════════════════════════════════\n`;
  output += `📋 Evaluated ${evaluation.rankings.length} responses\n`;

  return output;
}

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/nesquikm/mcp-rubber-duck'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.