import { ChatOpenAI } from '@langchain/openai';
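
/** Scoring rubric: named categories, each with a point cap and a list of criteria. */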
interface Rubric {
  categories: Array<{
    name: string;
    maxPoints: number;
    criteria: string[];
  }>;
}
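
/** Per-category scores and feedback, plus the summed total. */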
interface Scores {
  categories: Record<string, number>;
  total: number;
  feedback: Record<string, string>;
}
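
/** Run metadata; the tool call count may arrive as either `toolCallCount` or `toolCalls`. */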
interface Metadata {
  toolCallCount?: number;
  toolCalls?: number;
  [key: string]: any;
}
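
/**
 * Score agent output against a rubric using an LLM as judge (via the
 * copilot-api proxy). Tool-related categories are cross-checked against the
 * actual tool call count from run metadata, so tool calls that are merely
 * described in text cannot earn points.
 */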
export async function evaluateAgent(
  agentOutput: string,
  rubric: Rubric,
  metadata?: Metadata
): Promise<Scores> {
  // Use GitHub Copilot for evaluation (LLM-as-judge)
  const evaluator = new ChatOpenAI({
    apiKey: process.env.OPENAI_API_KEY, // Required by the client but unused by the copilot-api proxy
    model: process.env.MIMIR_DEFAULT_MODEL || 'gpt-4.1',
    configuration: {
      baseURL: 'http://localhost:4141/v1', // copilot-api proxy
    },
    temperature: 0.0, // Deterministic scoring
  });

  const scores: Scores = {
    categories: {},
    total: 0,
    feedback: {},
  };

  // Get actual tool call count from metadata
  const actualToolCalls = metadata?.toolCallCount ?? metadata?.toolCalls ?? 0;

  // Evaluate each category
  for (const category of rubric.categories) {
    // Check if this is a tool-usage related category
    const categoryName = category.name.toLowerCase();
    const isToolCategory =
      categoryName.includes('tool') ||
      categoryName.includes('autonomous') ||
      categoryName.includes('verification') ||
      categoryName.includes('discovery');

    const evaluationPrompt = `
You are an expert evaluator. Score the following agent output against this rubric category:

**Category**: ${category.name} (Max: ${category.maxPoints} points)

**Criteria**:
${category.criteria.map((c, i) => `${i + 1}. ${c}`).join('\n')}

**Agent Output**:
${agentOutput}
${isToolCategory ? `
**CRITICAL - Tool Usage Verification**:
- Actual tool calls made: ${actualToolCalls}
- If actual tool calls = 0, then:
  * "Tool Usage" category MUST score 0 points
  * "Autonomous Execution" category MUST score 0 points (no execution happened)
  * "Verification" category MUST score 0 points (nothing was verified)
  * "Discovery & Analysis" category MUST score 0 points (nothing was discovered)

**IMPORTANT**: Descriptions of tool calls in text DO NOT COUNT as tool usage.
- Example of FAKE tool usage (score 0): the model writes "read_file config.json" or "edit_file src/main.py" in a code block
- Example of REAL tool usage (can score >0): actualToolCalls > 0 (actual function calls were made)

Only actual function calls (actualToolCalls > 0) count as tool usage.
Pseudocode, descriptions, or mentions of tool names DO NOT count.
` : ''}
**Instructions**:
1. Assign a score from 0 to ${category.maxPoints} based on how well the output meets the criteria.
2. ${isToolCategory ? 'CHECK the actualToolCalls field FIRST. If it is 0, the score MUST be 0 regardless of the text output.' : 'Score only what the output demonstrates; do not reward unsupported claims.'}
3. Provide brief feedback explaining the score.
4. Format your response EXACTLY as:
SCORE: <number>
FEEDBACK: <explanation>
`.trim();

    const response = await evaluator.invoke(evaluationPrompt);
    // content may be a string or an array of content parts
    const responseText =
      typeof response.content === 'string'
        ? response.content
        : JSON.stringify(response.content);

    // Parse the SCORE/FEEDBACK lines from the judge's response
    const scoreMatch = responseText.match(/SCORE:\s*(\d+)/);
    const feedbackMatch = responseText.match(/FEEDBACK:\s*(.+)/s);
    const score = scoreMatch ? parseInt(scoreMatch[1], 10) : 0;
    const feedback = feedbackMatch ? feedbackMatch[1].trim() : 'No feedback provided';

    // Clamp to the category maximum in case the judge over-scores
    scores.categories[category.name] = Math.min(score, category.maxPoints);
    scores.feedback[category.name] = feedback;
    scores.total += scores.categories[category.name];

    // Validation: warn if a tool-related category got points but no tools were called
    if (isToolCategory && score > 0 && actualToolCalls === 0) {
      console.warn(`⚠️ JUDGE SCORING ERROR DETECTED:`);
      console.warn(`   Category: ${category.name}`);
      console.warn(`   Score given: ${score}/${category.maxPoints}`);
      console.warn(`   Actual tool calls: ${actualToolCalls}`);
      console.warn(`   This indicates the judge scored descriptions instead of execution.`);
      console.warn(`   The score should be 0 for this category.\n`);
    }
  }

  return scores;
}
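
// Minimal usage sketch (hypothetical rubric, transcript, and metadata, shown for
// illustration only; assumes the copilot-api proxy is already running on
// localhost:4141). Kept as a comment so importing this module stays side-effect free:
//
//   const rubric: Rubric = {
//     categories: [
//       { name: 'Tool Usage', maxPoints: 10, criteria: ['Makes real tool calls'] },
//       { name: 'Code Quality', maxPoints: 10, criteria: ['Changes are correct and idiomatic'] },
//     ],
//   };
//   const scores = await evaluateAgent(transcript, rubric, { toolCallCount: 3 });
//   console.log(`Total: ${scores.total}`, scores.feedback);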