name: Tests & Evaluation
on:
push:
branches: [ main ]
pull_request:
# Run on all pull requests, regardless of target branch
# Add permissions to allow PR comments
permissions:
contents: read
pull-requests: write
actions: read
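# contents: read is needed for checkout; pull-requests: write lets the evaluate job comment on PRs via gh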
jobs:
test:
name: Test
runs-on: ubuntu-latest
strategy:
matrix:
node-version: [18.x, 20.x]
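# Note: fail-fast defaults to true, so a failure on one Node version cancels the other matrix job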
steps:
- uses: actions/checkout@v4
# Setup PNPM first - it must be installed before setup-node can configure the pnpm cache
- name: Setup PNPM
uses: pnpm/action-setup@v2
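# No version input is given here, so pnpm/action-setup is expected to resolve the pnpm
# version from the packageManager field in package.json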
# Setup Node.js cache after PNPM is installed
- name: Setup Node.js with cache
uses: actions/setup-node@v4
with:
node-version: ${{ matrix.node-version }}
cache: 'pnpm'
- name: Install dependencies
run: pnpm install
- name: Typecheck (entire codebase)
run: pnpm typecheck
- name: Run tests
run: pnpm test
- name: Run tests with coverage
run: pnpm test:coverage
- name: Build
run: pnpm build
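# Build output is not shared between jobs; the evaluate job rebuilds the project before running evals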
# Evaluation job - runs after all test matrix jobs complete
evaluate:
name: Run Evaluations
# This job will only run if all test jobs succeed
needs: test
runs-on: ubuntu-latest
# Run only when the entire test matrix succeeded (explicit here, though this is already the default for a job with needs)
if: success()
steps:
- uses: actions/checkout@v4
- name: Setup PNPM
uses: pnpm/action-setup@v2
- name: Setup Node.js
uses: actions/setup-node@v4
with:
node-version: '18'
cache: 'pnpm'
- name: Install dependencies
run: pnpm install
- name: Build project for evaluation
run: pnpm run build
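# No MCP config file is written in CI; the server is configured entirely through environment
# variables (see the env block on the "Run evaluations" step), hence the informational step below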
- name: Configure MCP environment
run: echo "Using environment variable-based configuration"
# Verify the build file exists before running evals
- name: Verify build file exists
run: |
mkdir -p eval/reports
if [ ! -f "build/index.mjs" ]; then
echo "ERROR: build/index.mjs does not exist after build step!"
echo '<!DOCTYPE html>' > eval/reports/build-failed.html
echo '<html><head><title>Build Failed</title></head>' >> eval/reports/build-failed.html
echo '<body><h1>Evaluation Failed</h1>' >> eval/reports/build-failed.html
echo '<p>The MCP build output file does not exist. Check the build step for errors.</p>' >> eval/reports/build-failed.html
echo '</body></html>' >> eval/reports/build-failed.html
exit 1
else
echo "Build file found, proceeding with evaluation"
fi
- name: Run evaluations
id: run_evals
run: |
echo "Running evaluations..."
if ! pnpm run eval; then
echo "::error::Evaluation failed during execution"
echo "EVAL_OUTCOME=failed" >> $GITHUB_ENV
# Create a failure report but don't exit yet - we want to collect all artifacts
mkdir -p eval/reports
cat > eval/reports/eval-failed.html <<'EOF'
<!DOCTYPE html>
<html><head><title>Evaluation Failed</title></head>
<body><h1>Evaluation Failed</h1>
<p>The evaluation process encountered an error. Check the logs for details.</p>
<h2>Configuration Information</h2>
<pre>
EOF
if [ -n "$HONEYCOMB_API_KEY" ]; then
echo "Honeycomb API key is set (length: ${#HONEYCOMB_API_KEY})" >> eval/reports/eval-failed.html
else
echo "Honeycomb API key is not set!" >> eval/reports/eval-failed.html
echo "Make sure HONEYCOMB_API_KEY is set in GitHub secrets and passed to the workflow" >> eval/reports/eval-failed.html
fi
echo '</pre></body></html>' >> eval/reports/eval-failed.html
# Print environment variables (excluding secrets) for debugging
echo "Environment variables for debugging:"
env | grep -v -E "HONEYCOMB_API_KEY|OPENAI_API_KEY|ANTHROPIC_API_KEY" | sort
else
echo "EVAL_OUTCOME=success" >> $GITHUB_ENV
fi
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
# Use Honeycomb API key for environment variable-based config
HONEYCOMB_API_KEY: ${{ secrets.HONEYCOMB_API_KEY }}
# Use a limited set of models in CI to keep costs down
EVAL_MODELS: '{"openai":"gpt-4o-mini","anthropic":"claude-3-5-haiku-latest"}'
EVAL_CONCURRENCY: 2
EVAL_JUDGE_PROVIDER: "anthropic"
EVAL_JUDGE_MODEL: "claude-3-5-haiku-latest"
MCP_SERVER_COMMAND: "node build/index.mjs"
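# The eval harness presumably spawns this command to start the MCP server built above (build/index.mjs)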
- name: Ensure reports directory exists
run: mkdir -p eval/reports
- name: Create placeholder if no reports were generated
run: |
# Check if any HTML reports exist
if [ -z "$(find eval/reports -name '*.html' 2>/dev/null)" ]; then
echo "No reports were generated, creating a placeholder"
cat > eval/reports/no-reports.html <<'EOF'
<!DOCTYPE html>
<html><head><title>No Reports</title></head>
<body><h1>No evaluation reports generated</h1>
<p>This could be due to missing API keys or configuration.</p>
</body></html>
EOF
fi
- name: Find latest report
id: find-report
run: |
# Fall back to the placeholder explicitly: head exits 0 even when the glob matches nothing
LATEST_REPORT=$(ls -t eval/reports/*.html 2>/dev/null | head -1)
if [ -z "$LATEST_REPORT" ]; then
LATEST_REPORT="eval/reports/no-reports.html"
fi
echo "latest_report=$LATEST_REPORT" >> $GITHUB_OUTPUT
- name: Post report summary
run: |
if [ "$EVAL_OUTCOME" == "failed" ]; then
echo "## ❌ Evaluation Failed" > $GITHUB_STEP_SUMMARY
echo "The evaluation process encountered errors. See logs for details." >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "Error report: $(basename ${{ steps.find-report.outputs.latest_report }})" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "The error report is available as a workflow artifact." >> $GITHUB_STEP_SUMMARY
else
echo "## ✅ Evaluation Results" > $GITHUB_STEP_SUMMARY
echo "Ran evaluations with OpenAI and Anthropic models." >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "### Summary" >> $GITHUB_STEP_SUMMARY
echo "Latest report: $(basename ${{ steps.find-report.outputs.latest_report }})" >> $GITHUB_STEP_SUMMARY
echo "" >> $GITHUB_STEP_SUMMARY
echo "The full report is available as a workflow artifact." >> $GITHUB_STEP_SUMMARY
fi
# Add PR comment if we're on a PR
if [ "${{ github.event_name }}" == "pull_request" ]; then
# Start with basic PR comment header
echo "## Honeycomb MCP Evaluation Results" > pr_comment.txt
echo "" >> pr_comment.txt
if [ "$EVAL_OUTCOME" == "failed" ]; then
echo "❌ Evaluation process failed" >> pr_comment.txt
echo "" >> pr_comment.txt
echo "The evaluation process encountered errors. See workflow logs for details." >> pr_comment.txt
else
# Find the latest summary file
LATEST_SUMMARY=$(find eval/results -name "summary-*.json" -type f | sort -r | head -1)
if [ -n "$LATEST_SUMMARY" ] && [ -f "$LATEST_SUMMARY" ]; then
echo "Found summary file: $LATEST_SUMMARY"
# Extract key metrics, defaulting to 0 when the field is missing or null
RATE=$(jq -r '.successRate // 0' "$LATEST_SUMMARY" 2>/dev/null || echo "0")
# Convert the 0-1 success rate to a percentage
SUCCESS_RATE=$(echo "$RATE * 100" | bc -l | awk '{printf "%.1f", $0}')
PASSED=$(jq -r '.passed' "$LATEST_SUMMARY" 2>/dev/null || echo "N/A")
TOTAL=$(jq -r '.totalTests' "$LATEST_SUMMARY" 2>/dev/null || echo "N/A")
# Use bc for reliable floating point comparison
if (( $(echo "$RATE >= 0.75" | bc -l) )); then
echo "✅ Evaluations completed successfully: **${SUCCESS_RATE}%** pass rate (${PASSED}/${TOTAL} tests)" >> pr_comment.txt
elif (( $(echo "$RATE >= 0.5" | bc -l) )); then
echo "⚠️ Evaluations completed with mixed results: **${SUCCESS_RATE}%** pass rate (${PASSED}/${TOTAL} tests)" >> pr_comment.txt
else
echo "❌ Evaluations completed with poor results: **${SUCCESS_RATE}%** pass rate (${PASSED}/${TOTAL} tests)" >> pr_comment.txt
fi
echo "" >> pr_comment.txt
# Basic metrics table
echo "### Evaluation Summary" >> pr_comment.txt
echo "" >> pr_comment.txt
echo "| Metric | Value |" >> pr_comment.txt
echo "|--------|-------|" >> pr_comment.txt
echo "| Success Rate | ${SUCCESS_RATE}% |" >> pr_comment.txt
echo "| Tests Passed | $PASSED / $TOTAL |" >> pr_comment.txt
# Add latency if available
AVG_LATENCY=$(jq -r '.averageLatency' "$LATEST_SUMMARY" 2>/dev/null || echo "N/A")
if [ "$AVG_LATENCY" != "N/A" ] && [ "$AVG_LATENCY" != "null" ]; then
AVG_LATENCY_INT=$(echo "$AVG_LATENCY" | awk '{printf "%.0f", $0}')
echo "| Avg Latency | ${AVG_LATENCY_INT}ms |" >> pr_comment.txt
fi
# Add basic model information
echo "" >> pr_comment.txt
echo "### Models Tested" >> pr_comment.txt
echo "" >> pr_comment.txt
# Extract providers directly
echo "| Provider | Model |" >> pr_comment.txt
echo "|----------|-------|" >> pr_comment.txt
# OpenAI models
OPENAI_MODELS=$(jq -r '.results[] | select(.provider == "openai") | .model' "$LATEST_SUMMARY" 2>/dev/null | sort -u)
if [ -n "$OPENAI_MODELS" ]; then
while read -r model; do
if [ -n "$model" ]; then
echo "| OpenAI | $model |" >> pr_comment.txt
fi
done <<< "$OPENAI_MODELS"
fi
# Anthropic models
ANTHROPIC_MODELS=$(jq -r '.results[] | select(.provider == "anthropic") | .model' "$LATEST_SUMMARY" 2>/dev/null | sort -u)
if [ -n "$ANTHROPIC_MODELS" ]; then
while read -r model; do
if [ -n "$model" ]; then
echo "| Anthropic | $model |" >> pr_comment.txt
fi
done <<< "$ANTHROPIC_MODELS"
fi
else
echo "✅ Evaluations completed successfully" >> pr_comment.txt
echo "" >> pr_comment.txt
echo "No detailed metrics available" >> pr_comment.txt
fi
# Always add a link to the artifacts
echo "" >> pr_comment.txt
echo "📊 [View full report in workflow artifacts](https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }})" >> pr_comment.txt
fi
# Post the comment to the PR
gh pr comment ${{ github.event.pull_request.number }} --body-file pr_comment.txt
fi
env:
GH_TOKEN: ${{ github.token }}
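# github.token works here because of the pull-requests: write permission granted at the top of the workflow.
# Note: PRs opened from forks receive a read-only token, so the comment step may fail for those.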
# Create report index if it doesn't exist
- name: Generate report index if needed
run: |
if [ ! -f "eval/reports/index.html" ]; then
echo "Generating index.html for reports using the update-index script"
pnpm run eval:update-index
fi
# Upload evaluation reports as artifacts, even when an earlier step failed, so error reports are collected
- name: Upload evaluation reports
if: always()
uses: actions/upload-artifact@v4
with:
name: evaluation-reports
path: eval/reports/
retention-days: 30
# Final step to fail the job if evaluations failed
- name: Check final evaluation status
if: env.EVAL_OUTCOME == 'failed'
run: |
echo "::error::Evaluation failed - see artifacts for error report"
exit 1