# .github/workflows/eval.yml
name: Eval

# Run evals on manual dispatch, on pushes to main, and on PRs —
# but only when eval-related packages or this workflow change.
on:
  workflow_dispatch:
  push:
    branches: [main]
    paths:
      - "packages/mcp-core/src/tools*"
      - "packages/mcp-server-evals/**"
      - "packages/mcp-server-mocks/**"
      - ".github/workflows/eval.yml"
  pull_request:
    paths:
      - "packages/mcp-core/src/tools*"
      - "packages/mcp-server-evals/**"
      - "packages/mcp-server-mocks/**"
      - ".github/workflows/eval.yml"
jobs:
  eval:
    # "Actions" environment supplies the secrets referenced below
    # (OPENAI_API_KEY, CODECOV_TOKEN).
    environment: Actions
    runs-on: ubuntu-latest
    permissions:
      checks: write  # required by the "Create eval status check" step
      contents: read
    steps:
      - uses: actions/checkout@v4
      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"
      # pnpm/action-setup@v4 — pinned to a commit SHA for supply-chain safety
      - uses: pnpm/action-setup@a7487c7e89a18df4991f7f222e4898a00d66ddda
        name: Install pnpm
        with:
          run_install: false
      # Export the pnpm store path so the cache step below can key on it.
      - name: Get pnpm store directory
        shell: bash
        run: |
          echo "STORE_PATH=$(pnpm store path --silent)" >> $GITHUB_ENV
      - uses: actions/cache@v4
        name: Setup pnpm cache
        with:
          path: ${{ env.STORE_PATH }}
          key: ${{ runner.os }}-pnpm-store-${{ hashFiles('**/pnpm-lock.yaml') }}
          restore-keys: |
            ${{ runner.os }}-pnpm-store-
      - name: Install dependencies
        # NOTE(review): --no-frozen-lockfile allows CI to install with an
        # out-of-date lockfile, so builds are not fully reproducible; consider
        # --frozen-lockfile unless the drift is intentional — confirm.
        run: pnpm install --no-frozen-lockfile
      - name: Run build
        run: pnpm build
      - name: Run evals
        # continue-on-error so the status-check and upload steps below still
        # run and report results even when evals fail.
        run: pnpm eval:ci evals
        continue-on-error: true
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
- name: Create eval status check
uses: actions/github-script@v7
if: ${{ !cancelled() }}
with:
script: |
const fs = require('fs');
const path = require('path');
// Read eval results
const resultsPath = path.resolve(process.cwd(), 'packages/mcp-server-evals/eval-results.json');
console.log(`Reading eval results from: ${resultsPath}`);
let vitestResults;
try {
vitestResults = JSON.parse(fs.readFileSync(resultsPath, 'utf-8'));
} catch (error) {
if (error.code === 'ENOENT') {
throw new Error(
`Eval results file not found at ${resultsPath}. The eval run likely failed before producing results. Check the "Run evals" step logs for errors.`
);
}
throw new Error(`Failed to read/parse eval results: ${error.message}`);
}
// Extract eval results from vitest format
const evalResults = [];
for (const testFile of vitestResults.testResults || []) {
for (const test of testFile.assertionResults || []) {
if (test.meta?.eval) {
evalResults.push({
name: test.fullName || test.title,
file: testFile.name,
avgScore: test.meta.eval.avgScore ?? null,
scores: test.meta.eval.scores || [],
passed: test.status === 'passed',
duration: test.duration,
});
}
}
}
// Calculate statistics
const totalTests = evalResults.length;
// Treat null scores as 0.0 for consistent categorization
const scores = evalResults.map(r => r.avgScore !== null ? r.avgScore : 0);
const avgScore = scores.length > 0
? scores.reduce((sum, score) => sum + score, 0) / scores.length
: 0;
const green = scores.filter(s => s >= 0.75).length;
const yellow = scores.filter(s => s >= 0.5 && s < 0.75).length;
const red = scores.filter(s => s < 0.5).length;
// Determine conclusion
const conclusion = avgScore >= 0.5 ? 'success' : 'failure';
// Format score helper
function formatScore(score) {
if (score >= 0.75) return `🟢 ${score.toFixed(2)}`;
if (score >= 0.5) return `🟡 ${score.toFixed(2)}`;
return `🔴 ${score.toFixed(2)}`;
}
// Build title
const title = `Eval Score: ${avgScore.toFixed(2)} (${green} green, ${yellow} yellow, ${red} red)`;
// Build summary
const summary = [
`## Overall Statistics`,
``,
`- **Total Evaluations**: ${totalTests}`,
`- **Average Score**: ${formatScore(avgScore)}`,
`- **Pass Threshold**: 0.50 (catastrophic failure)`,
``,
`### Score Distribution`,
`- 🟢 Green (≥0.75): ${green} evals`,
`- 🟡 Yellow (0.50-0.74): ${yellow} evals`,
`- 🔴 Red (<0.50): ${red} evals`,
].join('\n');
// Build detailed results
const detailsByScore = [...evalResults].sort((a, b) => (b.avgScore || 0) - (a.avgScore || 0));
const details = [
`## Individual Eval Scores`,
``,
...detailsByScore.map(result => {
const score = result.avgScore !== null ? result.avgScore : 0;
const statusIcon = result.passed ? '✅' : '❌';
const scoreDisplay = formatScore(score);
let line = `${statusIcon} **${result.name}**: ${scoreDisplay}`;
// Add rationale for failed or low-scoring tests
if (!result.passed || score < 0.75) {
const firstScore = result.scores[0];
if (firstScore?.metadata?.rationale) {
line += `\n - ${firstScore.metadata.rationale}`;
}
}
return line;
}),
``,
`---`,
``,
`### Conclusion`,
``,
conclusion === 'success'
? `✅ **Passed**: Average score (${avgScore.toFixed(2)}) is above the catastrophic failure threshold (0.50)`
: `❌ **Failed**: Average score (${avgScore.toFixed(2)}) is below the catastrophic failure threshold (0.50)`,
].join('\n');
// Create check run
await github.rest.checks.create({
owner: context.repo.owner,
repo: context.repo.repo,
name: 'Evaluation Results',
head_sha: context.sha,
status: 'completed',
conclusion: conclusion,
output: {
title: title,
summary: summary,
text: details,
},
});
console.log(`✅ Check run created with conclusion: ${conclusion}`);
console.log(` Average Score: ${avgScore.toFixed(2)}`);
- name: Upload coverage reports to Codecov
uses: codecov/codecov-action@v4
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
with:
flags: evals
name: codecov-evals
fail_ci_if_error: false
- name: Upload results to Codecov
if: ${{ !cancelled() }}
uses: codecov/test-results-action@v1
with:
token: ${{ secrets.CODECOV_TOKEN }}