process-results.js (6.34 kB)
#!/usr/bin/env node

/**
 * Process MCP evaluation results and generate summary
 */

import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

const RESULTS_DIR = path.join(__dirname, "..", "results");
const RESULTS_FILE = path.join(RESULTS_DIR, "evaluation-results.json");
const SUMMARY_FILE = path.join(RESULTS_DIR, "evaluation-summary.json");

function processResults() {
  try {
    // Ensure results directory exists
    if (!fs.existsSync(RESULTS_DIR)) {
      fs.mkdirSync(RESULTS_DIR, { recursive: true });
    }

    // Check if results file exists
    if (!fs.existsSync(RESULTS_FILE)) {
      console.error("❌ No evaluation results found at:", RESULTS_FILE);

      // Create a dummy summary for CI
      const dummySummary = {
        overall_score: 0,
        tests_passed: 0,
        total_tests: 0,
        status: "no_results",
        message: "No evaluation results found",
      };

      fs.writeFileSync(SUMMARY_FILE, JSON.stringify(dummySummary, null, 2));
      process.exit(1);
    }

    // Read evaluation results
    const results = JSON.parse(fs.readFileSync(RESULTS_FILE, "utf8"));

    // Process results
    const summary = {
      timestamp: new Date().toISOString(),
      overall_score: 0,
      tests_passed: 0,
      total_tests: 0,
      categories: {},
      failed_tests: [],
      top_performing_tests: [],
      recommendations: [],
    };

    // Handle different result formats
    let evaluations = [];
    if (Array.isArray(results)) {
      evaluations = results;
    } else if (results.evaluations) {
      evaluations = results.evaluations;
    } else if (results.results) {
      evaluations = results.results;
    } else {
      console.error("❌ Unexpected results format");
      process.exit(1);
    }

    // Process each evaluation
    let totalScore = 0;
    let scoreCount = 0;

    evaluations.forEach((evaluation) => {
      summary.total_tests++;

      // Calculate average score for this evaluation
      const scores = [
        evaluation.accuracy || 0,
        evaluation.completeness || 0,
        evaluation.relevance || 0,
        evaluation.clarity || 0,
        evaluation.reasoning || 0,
      ];
      const avgScore = scores.reduce((a, b) => a + b, 0) / scores.length;

      totalScore += avgScore;
      scoreCount++;

      // Determine if test passed (threshold 3.5)
      const passed = avgScore >= 3.5;
      if (passed) {
        summary.tests_passed++;
      } else {
        summary.failed_tests.push({
          name: evaluation.name,
          score: avgScore,
          reason: evaluation.overall_comments || "No specific reason provided",
        });
      }

      // Track top performing tests
      if (avgScore >= 4.5) {
        summary.top_performing_tests.push({
          name: evaluation.name,
          score: avgScore,
        });
      }

      // Categorize by tool type
      const category = evaluation.name.split("_")[0] || "general";
      if (!summary.categories[category]) {
        summary.categories[category] = {
          total: 0,
          passed: 0,
          avg_score: 0,
          scores: [],
        };
      }
      summary.categories[category].total++;
      summary.categories[category].scores.push(avgScore);
      if (passed) {
        summary.categories[category].passed++;
      }
    });

    // Calculate overall score
    summary.overall_score = scoreCount > 0 ? (totalScore / scoreCount).toFixed(2) : 0;

    // Calculate category averages
    Object.keys(summary.categories).forEach((category) => {
      const cat = summary.categories[category];
      cat.avg_score = cat.scores.reduce((a, b) => a + b, 0) / cat.scores.length;
      cat.avg_score = cat.avg_score.toFixed(2);
      delete cat.scores; // Remove raw scores from summary
    });

    // Generate recommendations
    if (summary.failed_tests.length > 0) {
      summary.recommendations.push("Review and fix failed tests to improve overall score");
    }
    if (summary.overall_score < 4.0) {
      summary.recommendations.push("Consider improving tool reliability and error handling");
    }
    if (summary.categories.error && summary.categories.error.passed / summary.categories.error.total < 0.8) {
      summary.recommendations.push("Improve error handling and edge case management");
    }

    // Determine status
    let status = "failed";
    if (summary.overall_score >= 4.5) {
      status = "excellent";
    } else if (summary.overall_score >= 4.0) {
      status = "good";
    } else if (summary.overall_score >= 3.5) {
      status = "passed";
    }
    summary.status = status;

    // Write summary
    fs.writeFileSync(SUMMARY_FILE, JSON.stringify(summary, null, 2));

    // Output results
    console.log("📊 Evaluation Summary:");
    console.log(`Overall Score: ${summary.overall_score}/5.0`);
    console.log(`Tests Passed: ${summary.tests_passed}/${summary.total_tests}`);
    console.log(`Status: ${status.toUpperCase()}`);

    if (summary.failed_tests.length > 0) {
      console.log("\n❌ Failed Tests:");
      summary.failed_tests.forEach((test) => {
        console.log(`  - ${test.name}: ${test.score.toFixed(2)}/5.0`);
      });
    }

    if (summary.top_performing_tests.length > 0) {
      console.log("\n✅ Top Performing Tests:");
      summary.top_performing_tests.forEach((test) => {
        console.log(`  - ${test.name}: ${test.score.toFixed(2)}/5.0`);
      });
    }

    console.log("\n📈 Category Breakdown:");
    Object.entries(summary.categories).forEach(([category, data]) => {
      console.log(`  ${category}: ${data.passed}/${data.total} passed (avg: ${data.avg_score}/5.0)`);
    });

    if (summary.recommendations.length > 0) {
      console.log("\n💡 Recommendations:");
      summary.recommendations.forEach((rec) => {
        console.log(`  - ${rec}`);
      });
    }

    console.log(`\n✅ Summary saved to: ${SUMMARY_FILE}`);

    // Exit with appropriate code
    process.exit(status === "failed" ? 1 : 0);
  } catch (error) {
    console.error("❌ Error processing results:", error.message);
    process.exit(1);
  }
}

// Run if called directly
if (import.meta.url === `file://${process.argv[1]}`) {
  processResults();
}

export { processResults };
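For reference, a minimal evaluation-results.json that this script can consume might look like the sketch below. The field names (name, accuracy, completeness, relevance, clarity, reasoning, overall_comments) are taken from the processing logic above; the test names and scores are purely illustrative, chosen so that "posts" and "error" become categories via the name.split("_")[0] rule. Once such a file exists in the results/ directory one level above the script, running the script with node writes evaluation-summary.json alongside it.

[
  {
    "name": "posts_create_basic",
    "accuracy": 4.5,
    "completeness": 4.0,
    "relevance": 5.0,
    "clarity": 4.0,
    "reasoning": 4.5,
    "overall_comments": "Tool call succeeded and the response was well structured"
  },
  {
    "name": "error_invalid_credentials",
    "accuracy": 3.0,
    "completeness": 2.5,
    "relevance": 3.5,
    "clarity": 3.0,
    "reasoning": 2.5,
    "overall_comments": "Error message did not explain how to fix the credentials"
  }
]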


MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/docdyhr/mcp-wordpress'
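The same endpoint can also be queried from Node (18+, which provides a global fetch). This is only a sketch: the response is assumed to be JSON, and its exact shape is defined by the MCP directory API rather than shown here.

// fetch-server-info.mjs -- query the MCP directory API for this server
const response = await fetch(
  "https://glama.ai/api/mcp/v1/servers/docdyhr/mcp-wordpress",
);
if (!response.ok) {
  throw new Error(`Request failed with status ${response.status}`);
}
const server = await response.json(); // shape documented by the MCP directory API
console.log(server);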

If you have feedback or need assistance with the MCP directory API, please join our Discord server.