name: Evaluation Quality Gate
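# Non-blocking quality gate: runs MCP evaluation suites against pull requests
# (or on demand via workflow_dispatch), posts a summary comment on the PR, and
# records a neutral check run so results are visible without blocking merges.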
on:
pull_request:
types: [opened, synchronize, reopened]
branches: [main]
paths:
- "simplenote_mcp/**"
- "tests/**"
- "evals/**"
- "pyproject.toml"
- "package.json"
workflow_dispatch:
inputs:
pr_number:
description: "PR number to evaluate (optional)"
required: false
type: number
evaluation_suite:
description: "Evaluation suite to run"
required: false
default: "basic"
type: choice
options:
- smoke
- basic
- comprehensive
fail_threshold:
description: "Minimum pass rate to fail (0-100)"
required: false
default: 70
type: number
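# Note: the inputs above apply only to workflow_dispatch runs; pull_request runs
# fall back to the 'basic' suite and the 70% threshold via the || defaults below.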
env:
NODE_VERSION: '18'
PYTHON_VERSION: '3.12'
permissions:
contents: read
pull-requests: write
checks: write
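# pull-requests: write is needed for the PR comment and checks: write for the
# check run created by the quality-gate-assessment job.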
jobs:
run-evaluations:
name: Run MCP Evaluations
runs-on: ubuntu-latest
timeout-minutes: 30  # allow for the 20-minute comprehensive suite plus setup time
outputs:
evaluation-results: ${{ steps.eval-results.outputs.results }}
pass-rate: ${{ steps.eval-results.outputs.pass_rate }}
quality-gate-passed: ${{ steps.eval-results.outputs.quality_gate_passed }}
status: ${{ steps.eval-results.outputs.status }}
steps:
- name: Checkout code
uses: actions/checkout@v6
- name: Set up Python
uses: actions/setup-python@v6
with:
python-version: ${{ env.PYTHON_VERSION }}
cache: 'pip'
- name: Set up Node.js
uses: actions/setup-node@v6
with:
node-version: ${{ env.NODE_VERSION }}
cache: 'npm'
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install -e ".[dev,test,all]"
- name: Install Node.js dependencies
run: |
npm ci
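# validate:evals is an npm script in package.json; it is assumed here to
# sanity-check the evaluation definitions before they are run.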
npm run validate:evals
- name: Determine evaluation suite
id: eval-suite
run: |
SUITE="${{ github.event.inputs.evaluation_suite || 'basic' }}"
echo "suite=${SUITE}" >> $GITHUB_OUTPUT
case "${SUITE}" in
smoke)
echo "timeout=5" >> $GITHUB_OUTPUT
echo "description=Quick smoke tests" >> $GITHUB_OUTPUT
;;
basic)
echo "timeout=10" >> $GITHUB_OUTPUT
echo "description=Standard evaluation suite" >> $GITHUB_OUTPUT
;;
comprehensive)
echo "timeout=20" >> $GITHUB_OUTPUT
echo "description=Full comprehensive evaluation" >> $GITHUB_OUTPUT
;;
esac
- name: Configure test environment
run: |
# Set up offline mode for evaluations
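# The credentials below are placeholders; with SIMPLENOTE_OFFLINE_MODE=true the
# evaluations are expected to run against local test data, not a real Simplenote account.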
echo "SIMPLENOTE_OFFLINE_MODE=true" >> $GITHUB_ENV
echo "SIMPLENOTE_EMAIL=test@example.com" >> $GITHUB_ENV
echo "SIMPLENOTE_PASSWORD=test_password" >> $GITHUB_ENV
# Create test data directory
mkdir -p test-data/
echo '{"test": "data"}' > test-data/sample.json
- name: Run MCP evaluations
id: run-evals
timeout-minutes: ${{ steps.eval-suite.outputs.timeout }}
run: |
echo "Running ${{ steps.eval-suite.outputs.description }}..."
# Run evaluations and capture results. The || fallback records a non-zero exit
# code without failing the step, so the parse and artifact steps still run.
EVAL_EXIT_CODE=0
case "${{ steps.eval-suite.outputs.suite }}" in
smoke)
npm run eval:smoke > evaluation-output.txt 2>&1 || EVAL_EXIT_CODE=$?
;;
basic)
npm run eval:basic > evaluation-output.txt 2>&1 || EVAL_EXIT_CODE=$?
;;
comprehensive)
timeout 1200s npm run eval:comprehensive > evaluation-output.txt 2>&1 || EVAL_EXIT_CODE=$?
;;
esac
echo "exit_code=${EVAL_EXIT_CODE}" >> $GITHUB_OUTPUT
# Display results
echo "=== Evaluation Output ==="
cat evaluation-output.txt
echo "========================="
# Save output for later steps
cp evaluation-output.txt eval-results.txt
- name: Parse evaluation results
id: eval-results
run: |
python - <<'EOF'
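# Inline parser: reads the captured evaluation output, extracts pass/fail counts
# and the pass rate, and writes both step outputs and evaluation-summary.json.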
import json
import re
import sys
import os
from pathlib import Path
def parse_evaluation_results(output_file):
"""Parse MCP evaluation results from output."""
try:
with open(output_file, 'r') as f:
content = f.read()
except FileNotFoundError:
print("No evaluation output found")
return None
# Initialize results
results = {
"total_tests": 0,
"passed_tests": 0,
"failed_tests": 0,
"pass_rate": 0.0,
"status": "unknown",
"details": [],
"error_summary": []
}
# Try to parse different output formats
lines = content.split('\n')
for line in lines:
line = line.strip()
# Look for test summary patterns
if "passed" in line.lower() and "failed" in line.lower():
# Pattern: "X passed, Y failed"
match = re.search(r'(\d+)\s+passed.*?(\d+)\s+failed', line, re.IGNORECASE)
if match:
results["passed_tests"] = int(match.group(1))
results["failed_tests"] = int(match.group(2))
results["total_tests"] = results["passed_tests"] + results["failed_tests"]
# Look for pass rate patterns
elif "%" in line and ("pass" in line.lower() or "success" in line.lower()):
match = re.search(r'(\d+(?:\.\d+)?)%', line)
if match:
results["pass_rate"] = float(match.group(1))
# Collect error information
elif "error" in line.lower() or "fail" in line.lower():
if len(line) > 10: # Avoid short generic lines
results["error_summary"].append(line)
# Look for evaluation-specific patterns
elif "evaluation" in line.lower() and ("complete" in line.lower() or "finished" in line.lower()):
results["details"].append(line)
# Calculate pass rate if not found
if results["total_tests"] > 0 and results["pass_rate"] == 0.0:
results["pass_rate"] = (results["passed_tests"] / results["total_tests"]) * 100
# Determine status
if results["total_tests"] == 0:
results["status"] = "no_tests"
elif results["failed_tests"] == 0:
results["status"] = "success"
elif results["pass_rate"] >= 90:
results["status"] = "good"
elif results["pass_rate"] >= 70:
results["status"] = "acceptable"
else:
results["status"] = "poor"
return results
# Parse results
results = parse_evaluation_results('eval-results.txt')
if results:
# Set outputs
pass_rate = results["pass_rate"]
total_tests = results["total_tests"]
passed_tests = results["passed_tests"]
failed_tests = results["failed_tests"]
fail_threshold = float(os.environ.get('INPUT_FAIL_THRESHOLD', '70'))
quality_gate_passed = pass_rate >= fail_threshold
print(f"Evaluation Results:")
print(f" Total Tests: {total_tests}")
print(f" Passed: {passed_tests}")
print(f" Failed: {failed_tests}")
print(f" Pass Rate: {pass_rate:.1f}%")
print(f" Threshold: {fail_threshold}%")
print(f" Quality Gate: {'PASSED' if quality_gate_passed else 'FAILED'}")
# Write outputs
with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
f.write(f"pass_rate={pass_rate:.1f}\n")
f.write(f"total_tests={total_tests}\n")
f.write(f"passed_tests={passed_tests}\n")
f.write(f"failed_tests={failed_tests}\n")
f.write(f"quality_gate_passed={str(quality_gate_passed).lower()}\n")
f.write(f"status={results['status']}\n")
# Write JSON results for artifact
with open('evaluation-summary.json', 'w') as f:
json.dump({
**results,
"threshold": fail_threshold,
"quality_gate_passed": quality_gate_passed
}, f, indent=2)
else:
print("Failed to parse evaluation results")
# Set default outputs
with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
f.write("pass_rate=0\n")
f.write("total_tests=0\n")
f.write("passed_tests=0\n")
f.write("failed_tests=0\n")
f.write("quality_gate_passed=false\n")
f.write("status=error\n")
EOF
env:
INPUT_FAIL_THRESHOLD: ${{ github.event.inputs.fail_threshold || '70' }}
- name: Upload evaluation artifacts
if: always()
uses: actions/upload-artifact@v6
with:
name: evaluation-results-${{ steps.eval-suite.outputs.suite }}
path: |
evaluation-output.txt
eval-results.txt
evaluation-summary.json
retention-days: 14
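# Second job: builds a human-readable report from the evaluation outputs, posts
# or updates a PR comment, and records a non-blocking check run.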
quality-gate-assessment:
name: Quality Gate Assessment
runs-on: ubuntu-latest
needs: run-evaluations
if: always()
permissions:
pull-requests: write
checks: write
steps:
- name: Create quality gate report
id: report
run: |
PASS_RATE="${{ needs.run-evaluations.outputs.pass-rate || '0' }}"
QUALITY_GATE_PASSED="${{ needs.run-evaluations.outputs.quality-gate-passed || 'false' }}"
STATUS="${{ needs.run-evaluations.outputs.status || 'unknown' }}"
echo "Creating quality gate report..."
# Determine overall result
if [ "${{ needs.run-evaluations.result }}" = "success" ] && [ "${QUALITY_GATE_PASSED}" = "true" ]; then
OVERALL_STATUS="✅ PASSED"
GATE_EMOJI="🟢"
elif [ "${{ needs.run-evaluations.result }}" = "success" ] && [ "${QUALITY_GATE_PASSED}" = "false" ]; then
OVERALL_STATUS="⚠️ PASSED (Below Threshold)"
GATE_EMOJI="🟡"
else
OVERALL_STATUS="❌ FAILED"
GATE_EMOJI="🔴"
fi
# Create report
cat > quality-gate-report.md << EOF
# ${GATE_EMOJI} Evaluation Quality Gate Report
## Overall Result: ${OVERALL_STATUS}
**Pass Rate**: ${PASS_RATE}%
**Threshold**: ${{ github.event.inputs.fail_threshold || '70' }}%
**Evaluation Suite**: ${{ github.event.inputs.evaluation_suite || 'basic' }}
**Quality Gate Passed**: ${QUALITY_GATE_PASSED}
## Assessment
EOF
# Add assessment based on status
case "${STATUS}" in
success)
echo "🎉 **Excellent!** All evaluations passed successfully." >> quality-gate-report.md
echo "" >> quality-gate-report.md
echo "The code changes maintain high quality standards and functionality." >> quality-gate-report.md
;;
good)
echo "✅ **Good!** Evaluations passed with high success rate." >> quality-gate-report.md
echo "" >> quality-gate-report.md
echo "The code changes meet quality standards with minor issues." >> quality-gate-report.md
;;
acceptable)
echo "⚠️ **Acceptable** but could be improved." >> quality-gate-report.md
echo "" >> quality-gate-report.md
echo "The code changes meet minimum quality requirements." >> quality-gate-report.md
echo "Consider reviewing failed tests and improving implementation." >> quality-gate-report.md
;;
poor)
echo "🔴 **Poor performance** - significant issues detected." >> quality-gate-report.md
echo "" >> quality-gate-report.md
echo "The code changes have quality issues that should be addressed." >> quality-gate-report.md
echo "Review failed evaluations and improve implementation before merging." >> quality-gate-report.md
;;
*)
echo "❓ **Unknown status** - evaluation results unclear." >> quality-gate-report.md
echo "" >> quality-gate-report.md
echo "Please review the evaluation logs for more information." >> quality-gate-report.md
;;
esac
echo "" >> quality-gate-report.md
echo "## Recommendations" >> quality-gate-report.md
echo "" >> quality-gate-report.md
if [ "${QUALITY_GATE_PASSED}" = "false" ]; then
echo "- 🔍 **Review failed evaluations** in the uploaded artifacts" >> quality-gate-report.md
echo "- 🛠️ **Fix failing test scenarios** to improve pass rate" >> quality-gate-report.md
echo "- 📋 **Update implementation** based on evaluation feedback" >> quality-gate-report.md
echo "- ✅ **Re-run evaluations** after making improvements" >> quality-gate-report.md
else
echo "- ✅ **Quality gate passed** - no immediate action required" >> quality-gate-report.md
echo "- 📊 **Monitor evaluation trends** over time" >> quality-gate-report.md
echo "- 🎯 **Consider running comprehensive evaluations** for important changes" >> quality-gate-report.md
fi
echo "" >> quality-gate-report.md
echo "---" >> quality-gate-report.md
echo "*Quality gate is currently non-blocking. This assessment is for informational purposes.*" >> quality-gate-report.md
echo "" >> quality-gate-report.md
echo "📁 **Evaluation artifacts**: Check the Actions tab for detailed results" >> quality-gate-report.md
# Output for next steps
echo "overall_status=${OVERALL_STATUS}" >> $GITHUB_OUTPUT
echo "report_created=true" >> $GITHUB_OUTPUT
cat quality-gate-report.md
- name: Comment on PR
if: github.event_name == 'pull_request' && steps.report.outputs.report_created == 'true'
uses: actions/github-script@v8
with:
script: |
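// Post the quality gate report as a PR comment; update the existing comment if
// one is already present to avoid stacking duplicate reports.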
const fs = require('fs');
try {
const report = fs.readFileSync('quality-gate-report.md', 'utf8');
// Check if we already have a quality gate comment
const comments = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
});
const existingComment = comments.data.find(
comment => comment.body.includes('Evaluation Quality Gate Report')
);
if (existingComment) {
// Update existing comment
await github.rest.issues.updateComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: existingComment.id,
body: report
});
console.log('Updated existing quality gate comment');
} else {
// Create new comment
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: report
});
console.log('Created new quality gate comment');
}
} catch (error) {
console.error('Failed to post quality gate comment:', error);
core.setFailed(`Failed to post comment: ${error.message}`);
}
- name: Create check run
if: always()
uses: actions/github-script@v8
with:
script: |
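// Record the gate as a check run. Below-threshold and failed evaluations map to
// a 'neutral' conclusion so the gate stays informational and never blocks merging.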
const passRate = '${{ needs.run-evaluations.outputs.pass-rate || '0' }}';
const qualityGatePassed = '${{ needs.run-evaluations.outputs.quality-gate-passed }}' === 'true';
const evalResult = '${{ needs.run-evaluations.result }}';
let conclusion, title, summary;
if (evalResult === 'success' && qualityGatePassed) {
conclusion = 'success';
title = '✅ Quality Gate Passed';
summary = `Evaluation pass rate: ${passRate}% (meets threshold)`;
} else if (evalResult === 'success') {
conclusion = 'neutral'; // Non-blocking
title = '⚠️ Quality Gate Below Threshold';
summary = `Evaluation pass rate: ${passRate}% (below threshold but non-blocking)`;
} else {
conclusion = 'neutral'; // Non-blocking
title = '❌ Evaluation Failed';
summary = `Evaluations failed to complete successfully (non-blocking)`;
}
await github.rest.checks.create({
owner: context.repo.owner,
repo: context.repo.repo,
name: 'Evaluation Quality Gate',
// On pull_request events context.sha is the merge commit; attach the check to the PR head instead.
head_sha: context.payload.pull_request ? context.payload.pull_request.head.sha : context.sha,
conclusion: conclusion,
output: {
title: title,
summary: summary,
text: `
This is a non-blocking quality gate based on MCP evaluation results.
**Pass Rate**: ${passRate}%
**Threshold**: ${{ github.event.inputs.fail_threshold || '70' }}%
**Suite**: ${{ github.event.inputs.evaluation_suite || 'basic' }}
Check the evaluation artifacts for detailed results.
`
}
});
- name: Upload quality gate report
if: always()
uses: actions/upload-artifact@v6
with:
name: quality-gate-report
path: quality-gate-report.md
retention-days: 30