base-comparative.ts
/**
 * Base Comparative Evaluator
 *
 * Shared functionality for comparing multiple AI models across scenarios
 * Eliminates code duplication between remediation, recommendation, and capability evaluators
 */

import { EvaluationScore } from './base.js';
import { VercelProvider } from '../../core/providers/vercel-provider';
import { getCurrentModel } from '../../core/model-config';
import { extractJsonFromAIResponse } from '../../core/platform-utils';
import { readFileSync } from 'fs';
import { join } from 'path';
import { DatasetAnalyzer, ComparisonScenario } from '../dataset-analyzer.js';
import { loadEvaluationMetadata, buildModelPricingContext, buildToolContext, type EvaluationMetadata } from '../metadata-loader.js';

export interface ComparativeEvaluationResult {
  scenario_summary: string;
  models_compared: string[];
  comparative_analysis: Record<string, {
    quality_score?: number;
    efficiency_score?: number;
    performance_score?: number;
    communication_score?: number;
    accuracy_score?: number;
    completeness_score?: number;
    clarity_score?: number;
    consistency_score?: number;
    weighted_total: number;
    strengths: string;
    weaknesses: string;
  }>;
  ranking: Array<{
    rank: number;
    model: string;
    score: number;
    rationale?: string;
    reasoning?: string;
  }>;
  overall_insights?: string;
}

export interface ComparativeEvaluationScore extends EvaluationScore {
  modelRankings: Array<{
    rank: number;
    model: string;
    score: number;
  }>;
  bestModel: string;
  modelCount: number;
}

export abstract class BaseComparativeEvaluator {
  abstract readonly name: string;
  abstract readonly description: string;
  protected abstract readonly promptFileName: string;
  protected abstract readonly toolName: string;

  protected evaluatorModel: VercelProvider;
  protected datasetAnalyzer: DatasetAnalyzer;
  protected promptTemplate: string;
  protected metadata: EvaluationMetadata;

  constructor(datasetDir?: string) {
    // Use Claude via VercelProvider as the evaluator (most reliable for complex comparative evaluation)
    this.evaluatorModel = new VercelProvider({
      provider: 'anthropic',
      apiKey: process.env.ANTHROPIC_API_KEY!,
      model: getCurrentModel('anthropic'),
      debugMode: process.env.DEBUG_DOT_AI === 'true'
    });

    this.datasetAnalyzer = new DatasetAnalyzer(datasetDir || './eval/datasets');

    // Prompt template will be loaded by subclass
    this.promptTemplate = '';

    // Load metadata
    this.metadata = loadEvaluationMetadata();
  }

  /**
   * Initialize the evaluator - must be called by subclass constructor
   */
  protected initializePrompt() {
    const promptPath = join(process.cwd(), 'src', 'evaluation', 'prompts', this.promptFileName);
    this.promptTemplate = readFileSync(promptPath, 'utf8');
  }

  /**
   * Evaluate all available models for scenarios
   * This method finds all scenarios with multiple model responses and evaluates them comparatively
   */
  async evaluateAllScenarios(): Promise<ComparativeEvaluationScore[]> {
    const scenarios = this.datasetAnalyzer.groupByScenario(this.toolName);
    const results: ComparativeEvaluationScore[] = [];

    console.log(`Found ${scenarios.length} scenarios with multiple models for comparative evaluation`);

    for (const scenario of scenarios) {
      try {
        const result = await this.evaluateScenario(scenario);
        results.push(result);
      } catch (error) {
        console.error(`Failed to evaluate scenario ${scenario.interaction_id}:`, error);
      }
    }

    return results;
  }

  /**
   * Conduct final assessment across all scenarios to determine overall winner
   */
  async conductFinalAssessment(scenarioResults: ComparativeEvaluationScore[]): Promise<any> {
    if (scenarioResults.length === 0) {
      throw new Error('No scenario results provided for final assessment');
    }

    // Load the overall winner assessment prompt
    const promptPath = join(process.cwd(), 'src', 'evaluation', 'prompts', 'overall-winner-assessment.md');
    const overallWinnerTemplate = readFileSync(promptPath, 'utf8');

    // Get all models that should have been tested (from first scenario)
    const allModels = scenarioResults[0]?.modelRankings?.map(r => r.model) || [];

    // Build the final assessment prompt with raw data
    const finalPrompt = overallWinnerTemplate
      .replace('{tool_type}', this.toolName)
      .replace('{total_scenarios}', scenarioResults.length.toString())
      .replace('{expected_models}', JSON.stringify(allModels))
      .replace('{scenario_results}', JSON.stringify(scenarioResults, null, 2));

    try {
      console.log(`\n🔍 Conducting final assessment across ${scenarioResults.length} scenarios for ${this.toolName}\n`);

      const response = await this.evaluatorModel.sendMessage(
        finalPrompt,
        `${this.name}-final-assessment`,
        {
          user_intent: `Final cross-scenario assessment for ${this.toolName}`,
          interaction_id: 'final-assessment'
        }
      );

      // Extract JSON from AI response
      const finalAssessment = extractJsonFromAIResponse(response.content);

      console.log(`✅ Final Assessment Complete for ${this.toolName}`);
      console.log(`🏆 Overall Winner: ${finalAssessment.overall_assessment?.winner || 'Unknown'}`);

      return finalAssessment;
    } catch (error) {
      console.error(`Final assessment failed for ${this.toolName}:`, error);
      throw error;
    }
  }

  /**
   * Evaluate a single scenario comparing all available models
   */
  async evaluateScenario(scenario: ComparisonScenario): Promise<ComparativeEvaluationScore> {
    // Build model responses section for the prompt
    const modelResponsesText = scenario.models.map((modelResponse, index) => {
      // Build failure analysis context
      let reliabilityContext = '✅ Completed successfully';
      if (modelResponse.metadata.failure_analysis) {
        const failure = modelResponse.metadata.failure_analysis;
        reliabilityContext = `⚠️ **${failure.failure_type.toUpperCase()} FAILURE**: ${failure.failure_reason}`;
        if (failure.failure_type === 'timeout') {
          reliabilityContext += `\n- **Time to failure**: ${Math.round(failure.time_to_failure / 1000)}s (${Math.round(failure.time_to_failure / 60000)}min)`;
          reliabilityContext += `\n- **Impact**: Model could not complete full workflow within time limit`;
        }
      }

      return `### Model ${index + 1}: ${modelResponse.model}

**Performance Metrics:**
- Duration: ${modelResponse.performance.duration_ms}ms
- Input Tokens: ${modelResponse.performance.input_tokens}
- Output Tokens: ${modelResponse.performance.output_tokens}
- Total Tokens: ${modelResponse.performance.total_tokens}
- Iterations: ${modelResponse.performance.iterations || 'N/A'}
- Tool Calls: ${modelResponse.performance.tool_calls_executed || 'N/A'}
- Cache Read: ${modelResponse.performance.cache_read_tokens || 0} tokens
- Cache Creation: ${modelResponse.performance.cache_creation_tokens || 0} tokens

**Reliability Status:** ${reliabilityContext}

**Response:**
${modelResponse.response}

---`;
    }).join('\n\n');

    const modelList = scenario.models.map(m => m.model).join('", "');

    // Generate the comparative evaluation prompt
    const evaluationPrompt = this.buildEvaluationPrompt(scenario, modelResponsesText, modelList);

    try {
      const response = await this.evaluatorModel.sendMessage(
        evaluationPrompt,
        `${this.name}-${scenario.interaction_id}`,
        {
          user_intent: `Comparative ${this.name} evaluation for ${scenario.interaction_id}`,
          interaction_id: scenario.interaction_id
        }
      );

      // Extract JSON from AI response with robust parsing
      const evaluation: ComparativeEvaluationResult = extractJsonFromAIResponse(response.content);

      // Convert to standard EvaluationScore format
      const rankings = evaluation.ranking || [];
      const bestModel = rankings.length > 0 ? rankings[0].model : scenario.models[0].model;
      const bestScore = rankings.length > 0 ? rankings[0].score : 0;

      return {
        key: `${this.name}_${scenario.interaction_id}`,
        score: bestScore,
        comment: evaluation.overall_insights || 'Comparative evaluation completed',
        confidence: 0.9, // High confidence for comparative evaluation
        modelRankings: rankings.map(r => ({
          rank: r.rank,
          model: r.model,
          score: r.score
        })),
        bestModel,
        modelCount: scenario.models.length
      };
    } catch (error) {
      console.error(`Comparative evaluation failed for ${scenario.interaction_id}:`, error);
      return {
        key: `${this.name}_${scenario.interaction_id}`,
        score: 0,
        comment: `Evaluation error: ${error}`,
        confidence: 0,
        modelRankings: [],
        bestModel: 'unknown',
        modelCount: scenario.models.length
      };
    }
  }

  /**
   * Build the evaluation prompt - can be overridden by subclasses for custom behavior
   */
  protected buildEvaluationPrompt(scenario: ComparisonScenario, modelResponsesText: string, modelList: string): string {
    // Build metadata context sections
    const pricingContext = buildModelPricingContext(this.metadata.models);
    const toolContext = buildToolContext(this.toolName, this.metadata.tools);

    // Inject all data into prompt template via placeholders
    return this.promptTemplate
      .replace('{pricing_context}', pricingContext)
      .replace('{tool_context}', toolContext)
      .replace('{issue}', scenario.issue)
      .replace('{model_responses}', modelResponsesText)
      .replace('{model_list}', modelList)
      .replace('{phase}', scenario.interaction_id)
      .replace('{scenario_name}', scenario.interaction_id);
  }

  /**
   * Get statistics about available datasets
   */
  getDatasetStats() {
    return this.datasetAnalyzer.getDatasetStats(this.toolName);
  }

  /**
   * Get detailed breakdown of evaluation phases available
   * Must be implemented by subclasses to provide domain-specific phase descriptions
   */
  abstract getEvaluationPhases(): {
    phase: string;
    description: string;
    availableModels: string[];
    scenarioCount: number;
  }[];
}
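The file's header comment notes that this base class is shared by the remediation, recommendation, and capability evaluators. Those concrete subclasses are not shown here, so the following is only a minimal sketch of how one might extend the class under stated assumptions: the class name, prompt file name, tool name, and phase description below are illustrative placeholders, not values from this repository.

// Hypothetical subclass sketch; names and values are assumptions for illustration only.
import { BaseComparativeEvaluator } from './base-comparative.js';

export class ExampleComparativeEvaluator extends BaseComparativeEvaluator {
  readonly name = 'example-comparative';
  readonly description = 'Compares model responses for an example tool';
  protected readonly promptFileName = 'example-comparative.md'; // assumed prompt file under src/evaluation/prompts
  protected readonly toolName = 'example-tool'; // assumed tool name used to group dataset scenarios

  constructor(datasetDir?: string) {
    super(datasetDir);
    // The base constructor leaves promptTemplate empty; the subclass must load it.
    this.initializePrompt();
  }

  getEvaluationPhases() {
    // Domain-specific phase breakdown; real implementations would derive this from dataset stats.
    return [
      {
        phase: 'example-phase',
        description: 'Placeholder phase description',
        availableModels: [],
        scenarioCount: 0
      }
    ];
  }
}

With such a subclass in place, a caller would typically run evaluateAllScenarios() and then pass the per-scenario results to conductFinalAssessment() to obtain the cross-scenario winner.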
