/**
* LLM Evaluation Tools
*
 * Provides tools for LLM-based evaluation of verification quality. Rather than
 * calling a model directly, each tool returns a prompt pair for the client to
 * process; the client reports results back via elenchus_submit_llm_evaluation.
*/
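//
// Typical round trip (illustrative sketch; the tool names are the exports
// below, while the hosting client supplies the actual LLM call):
//
//   1. elenchus_evaluate_convergence({ sessionId })
//      -> { systemPrompt, userPrompt, instructions, ... }
//   2. The client sends the prompt pair to an LLM of its choice.
//   3. elenchus_submit_llm_evaluation({ sessionId, evaluationType: 'convergence',
//      llmResponse }) parses the JSON verdict and stores it on the session.
//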
import { z } from 'zod';
import { getSession } from '../state/session.js';
import { buildEvaluatorContext } from '../llm-evaluators/integration.js';
// =============================================================================
// Schemas
// =============================================================================
const EvaluateConvergenceSchema = z.object({
sessionId: z.string().describe('Session ID to evaluate'),
});
const EvaluateSeveritySchema = z.object({
sessionId: z.string().describe('Session ID'),
issueId: z.string().describe('Issue ID to evaluate'),
codeContext: z.string().optional().describe('Additional code context for evaluation'),
});
const EvaluateEdgeCasesSchema = z.object({
sessionId: z.string().describe('Session ID to evaluate'),
});
const SubmitLLMEvaluationSchema = z.object({
sessionId: z.string().describe('Session ID'),
evaluationType: z.enum(['convergence', 'severity', 'edgeCases', 'falsePositive']).describe('Type of evaluation'),
llmResponse: z.string().describe('LLM response to the evaluation prompt'),
targetId: z.string().optional().describe('Target ID (issue ID for severity/falsePositive evaluations)'),
});
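// Note: 'falsePositive' is accepted by elenchus_submit_llm_evaluation even
// though this module exposes no prompt-builder for it; the handler returns the
// parsed verdict without storing anything on the session.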
// =============================================================================
// Prompts
// =============================================================================
const CONVERGENCE_SYSTEM_PROMPT = `You are an expert code verification quality assessor.
Your task is to evaluate whether a verification session has adequately examined the codebase.
Consider:
1. CONTEXT SENSITIVITY: Different code types need different scrutiny levels
2. QUALITY OVER QUANTITY: Focus on verification DEPTH, not just coverage keywords
3. ISSUE PATTERNS: Are unresolved issues truly blockers?
4. VERIFICATION COMPLETENESS: Were all critical paths examined?
Respond with a JSON object containing your evaluation.`;
const CONVERGENCE_PROMPT_TEMPLATE = `Evaluate this verification session for convergence readiness:
## Requirements
{requirements}
## Target Files
{targetFiles}
## Current Round
Round {currentRound}
## All Verification Outputs
{allOutputs}
## Issues Found
{issuesSummary}
---
Respond with ONLY a valid JSON object:
{
"passed": boolean,
"confidence": "high" | "medium" | "low" | "uncertain",
"qualityScore": number (0-100),
"categoryScores": { "SECURITY": number, "CORRECTNESS": number, "RELIABILITY": number, "MAINTAINABILITY": number, "PERFORMANCE": number },
"gaps": ["gap 1", "gap 2"],
"moreRoundsRecommended": boolean,
"reasoning": "detailed explanation"
}`;
const SEVERITY_PROMPT_TEMPLATE = `Assess the severity of this code issue:
## Issue
ID: {issueId}
Category: {category}
Current Severity: {severity}
Description: {description}
## Code Context
{codeContext}
---
Respond with ONLY a valid JSON object:
{
"severity": "CRITICAL" | "HIGH" | "MEDIUM" | "LOW",
"confidence": "high" | "medium" | "low",
"impact": {
"exploitability": "easy" | "moderate" | "difficult" | "theoretical",
"scope": "widespread" | "limited" | "isolated",
"businessImpact": "critical" | "significant" | "moderate" | "minimal"
},
"adjustment": { "direction": "escalate" | "downgrade" | "keep", "reason": "explanation" },
"reasoning": "detailed explanation"
}`;
const EDGE_CASE_PROMPT_TEMPLATE = `Evaluate the edge case coverage in this verification:
## Target Files
{targetFiles}
## Verification Outputs
{verificationOutputs}
---
Analyze whether edge cases were GENUINELY analyzed (not just mentioned).
Respond with ONLY a valid JSON object:
{
"passed": boolean,
"confidence": "high" | "medium" | "low",
"coverageScore": number (0-100),
"analyzedCases": [{ "description": "case", "category": "type", "adequatelyHandled": boolean }],
"missingCases": [{ "description": "case", "category": "type", "importance": "critical" | "important" | "nice-to-have" }],
"reasoning": "detailed explanation"
}`;
// =============================================================================
// Handlers
// =============================================================================
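/**
 * Builds the convergence-evaluation prompt for a session: summarizes issues,
 * fills the template, and returns a system/user prompt pair. No LLM is called.
 */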
async function evaluateConvergence(args: z.infer<typeof EvaluateConvergenceSchema>) {
const session = await getSession(args.sessionId);
if (!session) {
return { error: 'Session not found' };
}
const context = buildEvaluatorContext(session);
const issuesSummary = session.issues.map(i =>
`[${i.id}] ${i.severity} ${i.category} (${i.status}): ${i.description}`
).join('\n') || 'No issues found';
  // Use function replacers so '$' sequences in the substituted text are not
  // interpreted as special replacement patterns by String.prototype.replace.
  const prompt = CONVERGENCE_PROMPT_TEMPLATE
    .replace('{requirements}', () => context.requirements)
    .replace('{targetFiles}', () => context.targetFiles.slice(0, 20).join('\n'))
    .replace('{currentRound}', () => String(context.currentRound))
    .replace('{allOutputs}', () => truncate(context.allOutputs, 8000))
    .replace('{issuesSummary}', () => issuesSummary);
return {
evaluationType: 'convergence',
sessionId: args.sessionId,
systemPrompt: CONVERGENCE_SYSTEM_PROMPT,
userPrompt: prompt,
instructions: 'Send this prompt to an LLM, then call elenchus_submit_llm_evaluation with the response.',
};
}
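/**
 * Builds the severity-assessment prompt for a single issue, using the caller's
 * codeContext, the issue's recorded evidence, or a placeholder as context.
 */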
async function evaluateSeverity(args: z.infer<typeof EvaluateSeveritySchema>) {
const session = await getSession(args.sessionId);
if (!session) {
return { error: 'Session not found' };
}
const issue = session.issues.find(i => i.id === args.issueId);
if (!issue) {
return { error: 'Issue not found' };
}
  // Function replacers, as above, keep '$' sequences in the values literal.
  const prompt = SEVERITY_PROMPT_TEMPLATE
    .replace('{issueId}', () => issue.id)
    .replace('{category}', () => issue.category)
    .replace('{severity}', () => issue.severity)
    .replace('{description}', () => issue.description)
    .replace('{codeContext}', () => args.codeContext || issue.evidence || 'No additional context');
return {
evaluationType: 'severity',
sessionId: args.sessionId,
issueId: args.issueId,
systemPrompt: 'You are an expert security and code quality analyst.',
userPrompt: prompt,
instructions: 'Send this prompt to an LLM, then call elenchus_submit_llm_evaluation with the response.',
};
}
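/**
 * Builds the edge-case-coverage prompt from the session's target files and
 * accumulated verification outputs.
 */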
async function evaluateEdgeCases(args: z.infer<typeof EvaluateEdgeCasesSchema>) {
const session = await getSession(args.sessionId);
if (!session) {
return { error: 'Session not found' };
}
const context = buildEvaluatorContext(session);
  // Function replacers, as above, keep '$' sequences in the values literal.
  const prompt = EDGE_CASE_PROMPT_TEMPLATE
    .replace('{targetFiles}', () => context.targetFiles.slice(0, 20).join('\n'))
    .replace('{verificationOutputs}', () => truncate(context.allOutputs, 6000));
return {
evaluationType: 'edgeCases',
sessionId: args.sessionId,
systemPrompt: 'You are an expert at evaluating test coverage and edge case analysis.',
userPrompt: prompt,
instructions: 'Send this prompt to an LLM, then call elenchus_submit_llm_evaluation with the response.',
};
}
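/**
 * Parses an LLM response to one of the evaluation prompts above and records
 * the verdict on the session. The parsed result is returned either way.
 */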
async function submitLLMEvaluation(args: z.infer<typeof SubmitLLMEvaluationSchema>) {
const session = await getSession(args.sessionId);
if (!session) {
return { error: 'Session not found' };
}
try {
    // Extract JSON: prefer a fenced code block, otherwise fall back to the
    // outermost brace pair (the greedy match keeps nested objects intact).
const jsonMatch = args.llmResponse.match(/```(?:json)?\s*([\s\S]*?)```/) ||
args.llmResponse.match(/(\{[\s\S]*\})/);
if (!jsonMatch) {
return { error: 'Could not parse JSON from LLM response' };
}
const evaluation = JSON.parse(jsonMatch[1].trim());
    // Store the result on the session. The session object is mutated in place;
    // this assumes the store behind getSession persists changes by reference.
    if (!session.llmEvalResults) {
      session.llmEvalResults = {};
    }
    // Track whether this evaluation type actually persists anything.
    let stored = true;
    switch (args.evaluationType) {
case 'convergence':
session.llmEvalResults.convergence = {
qualityScore: evaluation.qualityScore || 0,
categoryScores: evaluation.categoryScores || {},
gaps: evaluation.gaps || [],
moreRoundsRecommended: evaluation.moreRoundsRecommended || false,
evaluatedAt: new Date().toISOString(),
};
break;
case 'severity':
if (!session.llmEvalResults.severityAdjustments) {
session.llmEvalResults.severityAdjustments = [];
}
        // Record only genuine adjustments; skip when the verdict is 'keep' or
        // the adjustment object is missing from the response.
        if (args.targetId && evaluation.adjustment && evaluation.adjustment.direction !== 'keep') {
session.llmEvalResults.severityAdjustments.push({
issueId: args.targetId,
originalSeverity: session.issues.find(i => i.id === args.targetId)?.severity || 'UNKNOWN',
adjustedSeverity: evaluation.severity,
reason: evaluation.adjustment?.reason || evaluation.reasoning,
});
}
break;
case 'edgeCases':
session.llmEvalResults.edgeCaseCoverage = {
coverageScore: evaluation.coverageScore || 0,
analyzedCases: evaluation.analyzedCases?.length || 0,
missingCritical: evaluation.missingCases?.filter((c: { importance: string }) => c.importance === 'critical').length || 0,
};
        break;
      case 'falsePositive':
        // No session-level storage for false-positive verdicts yet; the parsed
        // result is still returned to the caller below.
        stored = false;
        break;
    }
return {
success: true,
evaluationType: args.evaluationType,
result: evaluation,
      stored,
};
} catch (error) {
return {
error: `Failed to parse evaluation: ${error instanceof Error ? error.message : 'Unknown error'}`,
};
}
}
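/**
 * Middle-out truncation: keeps the first and last maxLength/2 characters and
 * elides the middle, preserving both the opening context and the most recent
 * output.
 */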
function truncate(text: string, maxLength: number): string {
if (text.length <= maxLength) return text;
const half = Math.floor(maxLength / 2);
return text.slice(0, half) + '\n\n... [truncated] ...\n\n' + text.slice(-half);
}
// =============================================================================
// Export Tools
// =============================================================================
export const llmEvalTools = {
elenchus_evaluate_convergence: {
description: 'Get LLM evaluation prompt for convergence quality assessment. Returns a prompt to send to an LLM.',
schema: EvaluateConvergenceSchema,
handler: evaluateConvergence,
},
elenchus_evaluate_severity: {
description: 'Get LLM evaluation prompt for issue severity assessment. Returns a prompt to send to an LLM.',
schema: EvaluateSeveritySchema,
handler: evaluateSeverity,
},
elenchus_evaluate_edge_cases: {
description: 'Get LLM evaluation prompt for edge case coverage. Returns a prompt to send to an LLM.',
schema: EvaluateEdgeCasesSchema,
handler: evaluateEdgeCases,
},
elenchus_submit_llm_evaluation: {
description: 'Submit LLM evaluation response. Call this after receiving an LLM response to an evaluation prompt.',
schema: SubmitLLMEvaluationSchema,
handler: submitLLMEvaluation,
},
};