Skip to main content
Glama
eval.yml (7.33 kB)
name: Eval

# Runs the MCP eval suite whenever eval-relevant packages (or this workflow)
# change, publishes a "Evaluation Results" check run with per-eval scores,
# and uploads coverage/test results to Codecov.
on:
  workflow_dispatch:
  push:
    branches: [main]
    paths:
      - "packages/mcp-core/src/tools*"
      - "packages/mcp-server-evals/**"
      - "packages/mcp-server-mocks/**"
      - ".github/workflows/eval.yml"
  pull_request:
    paths:
      - "packages/mcp-core/src/tools*"
      - "packages/mcp-server-evals/**"
      - "packages/mcp-server-mocks/**"
      - ".github/workflows/eval.yml"

jobs:
  eval:
    environment: Actions
    runs-on: ubuntu-latest
    permissions:
      checks: write # required for github.rest.checks.create below
      contents: read
    steps:
      - uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      # pnpm/action-setup@v4 (pinned to a commit SHA for supply-chain safety)
      - uses: pnpm/action-setup@a7487c7e89a18df4991f7f222e4898a00d66ddda
        name: Install pnpm
        with:
          run_install: false

      - name: Get pnpm store directory
        shell: bash
        run: |
          echo "STORE_PATH=$(pnpm store path --silent)" >> $GITHUB_ENV

      - uses: actions/cache@v4
        name: Setup pnpm cache
        with:
          path: ${{ env.STORE_PATH }}
          key: ${{ runner.os }}-pnpm-store-${{ hashFiles('**/pnpm-lock.yaml') }}
          restore-keys: |
            ${{ runner.os }}-pnpm-store-

      - name: Install dependencies
        run: pnpm install --no-frozen-lockfile

      - name: Run build
        run: pnpm build

      # continue-on-error so the status-check step below still runs and can
      # report a red check instead of the whole job dying silently.
      - name: Run evals
        run: pnpm eval:ci evals
        continue-on-error: true
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

      - name: Create eval status check
        uses: actions/github-script@v7
        if: ${{ !cancelled() }}
        with:
          script: |
            const fs = require('fs');
            const path = require('path');

            // Read eval results produced by the "Run evals" step.
            const resultsPath = path.resolve(process.cwd(), 'packages/mcp-server-evals/eval-results.json');
            console.log(`Reading eval results from: ${resultsPath}`);

            let vitestResults;
            try {
              vitestResults = JSON.parse(fs.readFileSync(resultsPath, 'utf-8'));
            } catch (error) {
              if (error.code === 'ENOENT') {
                throw new Error(
                  `Eval results file not found at ${resultsPath}. The eval run likely failed before producing results. 
            Check the "Run evals" step logs for errors.`
                );
              }
              throw new Error(`Failed to read/parse eval results: ${error.message}`);
            }

            // Extract eval results from vitest format: only assertions carrying
            // `meta.eval` are scored evals; plain tests are ignored.
            const evalResults = [];
            for (const testFile of vitestResults.testResults || []) {
              for (const test of testFile.assertionResults || []) {
                if (test.meta?.eval) {
                  evalResults.push({
                    name: test.fullName || test.title,
                    file: testFile.name,
                    avgScore: test.meta.eval.avgScore ?? null,
                    scores: test.meta.eval.scores || [],
                    passed: test.status === 'passed',
                    duration: test.duration,
                  });
                }
              }
            }

            // Calculate statistics.
            const totalTests = evalResults.length;
            // Treat null scores as 0.0 for consistent categorization.
            const scores = evalResults.map(r => r.avgScore !== null ? r.avgScore : 0);
            const avgScore = scores.length > 0
              ? scores.reduce((sum, score) => sum + score, 0) / scores.length
              : 0;
            const green = scores.filter(s => s >= 0.75).length;
            const yellow = scores.filter(s => s >= 0.5 && s < 0.75).length;
            const red = scores.filter(s => s < 0.5).length;

            // Determine conclusion: only a catastrophic average (< 0.50) fails the check.
            const conclusion = avgScore >= 0.5 ? 'success' : 'failure';

            // Format score helper: traffic-light icon + two-decimal score.
            function formatScore(score) {
              if (score >= 0.75) return `🟢 ${score.toFixed(2)}`;
              if (score >= 0.5) return `🟡 ${score.toFixed(2)}`;
              return `🔴 ${score.toFixed(2)}`;
            }

            // Build title.
            const title = `Eval Score: ${avgScore.toFixed(2)} (${green} green, ${yellow} yellow, ${red} red)`;

            // Build summary.
            const summary = [
              `## Overall Statistics`,
              ``,
              `- **Total Evaluations**: ${totalTests}`,
              `- **Average Score**: ${formatScore(avgScore)}`,
              `- **Pass Threshold**: 0.50 (catastrophic failure)`,
              ``,
              `### Score Distribution`,
              `- 🟢 Green (≥0.75): ${green} evals`,
              `- 🟡 Yellow (0.50-0.74): ${yellow} evals`,
              `- 🔴 Red (<0.50): ${red} evals`,
            ].join('\n');

            // Build detailed results, highest score first (null sorts as 0).
            const detailsByScore = [...evalResults].sort((a, b) => (b.avgScore ?? 0) - (a.avgScore ?? 0));
            const details = [
              `## Individual Eval Scores`,
              ``,
              ...detailsByScore.map(result => {
                const score = result.avgScore !== null ? result.avgScore : 0;
                const statusIcon = result.passed ? '✅' : '❌';
                const scoreDisplay = formatScore(score);
                let line = `${statusIcon} **${result.name}**: ${scoreDisplay}`;
                // Add rationale for failed or low-scoring tests.
                if (!result.passed || score < 0.75) {
                  const firstScore = result.scores[0];
                  if (firstScore?.metadata?.rationale) {
                    line += `\n  - ${firstScore.metadata.rationale}`;
                  }
                }
                return line;
              }),
              ``,
              `---`,
              ``,
              `### Conclusion`,
              ``,
              conclusion === 'success'
                ? `✅ **Passed**: Average score (${avgScore.toFixed(2)}) is above the catastrophic failure threshold (0.50)`
                : `❌ **Failed**: Average score (${avgScore.toFixed(2)}) is below the catastrophic failure threshold (0.50)`,
            ].join('\n');

            // FIX: on pull_request events, context.sha is the ephemeral merge
            // commit, so a check created against it never shows up on the PR.
            // Attach the check to the PR head commit instead; fall back to
            // context.sha for push/workflow_dispatch runs.
            const headSha = context.payload.pull_request?.head?.sha ?? context.sha;

            // Create check run.
            await github.rest.checks.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              name: 'Evaluation Results',
              head_sha: headSha,
              status: 'completed',
              conclusion: conclusion,
              output: {
                title: title,
                summary: summary,
                text: details,
              },
            });

            console.log(`✅ Check run created with conclusion: ${conclusion}`);
            console.log(`   Average Score: ${avgScore.toFixed(2)}`);

      - name: Upload coverage reports to Codecov
        uses: codecov/codecov-action@v4
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        with:
          flags: evals
          name: codecov-evals
          fail_ci_if_error: false

      - name: Upload results to Codecov
        if: ${{ !cancelled() }}
        uses: codecov/test-results-action@v1
        with:
          token: ${{ secrets.CODECOV_TOKEN }}

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/getsentry/sentry-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.