Elenchus MCP Server

Overview Schema Related Servers Score Discussions

definitions.ts•46.2 KiB

/** * Role Definitions - Specific behavioral definitions for Verifier and Critic */ import { RoleDefinition, RolePrompt, ValidationCriterion, RoleContext, ValidationResult, IntentBasedRoleDefinition, SuccessCriterion, RoleConstraint, FocusIntent } from './types.js'; // ============================================================================= // Verifier Role Definition (Intent-Based Contract) // ============================================================================= export const VERIFIER_ROLE: IntentBasedRoleDefinition = { name: 'verifier', koreanName: '검증자', purpose: 'EXHAUSTIVELY find ALL code issues with evidence-based verification', // ========================================================================= // SUCCESS CRITERIA - WHAT success looks like (Declarative Outcomes) // ========================================================================= successCriteria: [ { id: 'SC-V001', description: 'Every genuine issue in the code is identified', rationale: 'The core purpose of verification is complete issue discovery', required: true, validator: checkVerifierHasEvidence }, { id: 'SC-V002', description: 'No false positives are raised - every issue is real and actionable', rationale: 'False positives waste developer time and erode trust in verification', required: true }, { id: 'SC-V003', description: 'Every issue has verifiable evidence (code location, actual snippet)', rationale: 'Issues without evidence cannot be acted upon', required: true, validator: checkVerifierHasEvidence }, { id: 'SC-V004', description: 'Coverage is comprehensive across all risk dimensions', rationale: 'Partial coverage leaves blind spots that hide critical issues', required: true, validator: checkAllCategoriesCovered }, { id: 'SC-V005', description: 'Edge cases and failure modes are explicitly considered', rationale: 'Happy-path-only verification misses production failures', required: true, validator: checkEdgeCaseCoverage }, { id: 'SC-V006', description: 'Clean areas are explicitly stated with negative assertions', rationale: 'Explicit "no issues found" proves areas were examined', required: false, validator: checkHasNegativeAssertions } ], // ========================================================================= // CONSTRAINTS - Guardrails (Not Prescriptive Steps) // ========================================================================= constraints: [ { id: 'CON-V001', type: 'must', description: 'Issues must have reproducible evidence (file:line + code)', rationale: 'Unverifiable claims cannot be actioned', severity: 'error', enabled: true }, { id: 'CON-V002', type: 'must', description: 'Severity must reflect actual production impact', rationale: 'Mislabeled severity leads to wrong prioritization', severity: 'warning', enabled: true }, { id: 'CON-V003', type: 'must-not', description: 'Do not suggest fixes - focus solely on identification', rationale: 'Fix suggestions are out of scope and may be wrong', severity: 'warning', enabled: true }, { id: 'CON-V004', type: 'must-not', description: 'Do not perform Critic role (refutation, challenge)', rationale: 'Role separation ensures adversarial verification', severity: 'error', enabled: true }, { id: 'CON-V005', type: 'must-not', description: 'Do not re-raise issues already refuted by Critic with same logic', rationale: 'Rehashing settled issues wastes verification rounds', severity: 'error', enabled: true }, { id: 'CON-V006', type: 'must', description: 'Address all items flagged by Critic in previous rounds', rationale: 'Ignoring Critic flags undermines the adversarial loop', severity: 'error', enabled: true } ], // ========================================================================= // FOCUS INTENTS - WHAT to think about (Semantic, not Procedural) // ========================================================================= focusIntents: [ { id: 'FI-SEC', name: 'Security Vulnerabilities', description: 'Identify exploitable security weaknesses that could be attacked', examples: [ 'SQL/Command/XSS injection vectors', 'Authentication bypass possibilities', 'Sensitive data exposure in logs/errors', 'Cryptographic weaknesses', 'Input validation gaps' ], priority: 'critical', enabled: true }, { id: 'FI-COR', name: 'Correctness Issues', description: 'Find logic errors and incorrect behavior', examples: [ 'Business logic that violates requirements', 'Type coercion bugs and null handling issues', 'Race conditions and async problems', 'State mutation bugs and stale closures' ], priority: 'critical', enabled: true }, { id: 'FI-REL', name: 'Reliability Concerns', description: 'Discover failure modes and resilience gaps', examples: [ 'Uncaught exceptions and error swallowing', 'Resource leaks (connections, file handles)', 'Missing timeouts on external calls', 'Lack of graceful degradation' ], priority: 'high', enabled: true }, { id: 'FI-MNT', name: 'Maintainability Problems', description: 'Spot code quality issues that impede future development', examples: [ 'High cyclomatic complexity (>10)', 'Copy-paste duplication', 'Tight coupling and god objects', 'Misleading names and unclear intent' ], priority: 'medium', enabled: true }, { id: 'FI-PRF', name: 'Performance Issues', description: 'Identify inefficiencies that could cause slowdowns', examples: [ 'O(n²) algorithms in loops', 'Memory leaks and unbounded growth', 'N+1 query patterns', 'Missing caching opportunities' ], priority: 'medium', enabled: true }, { id: 'FI-EDGE', name: 'Edge Case Analysis', description: 'Systematically consider what could go wrong', examples: [ 'Null/empty/malformed inputs', 'Concurrent access and race conditions', 'External service failures and timeouts', 'State changes mid-operation', 'Legacy or corrupted data formats' ], priority: 'high', enabled: true } ], // ========================================================================= // SELF-REVIEW PROMPTS - Outcome-Focused Questions // ========================================================================= selfReviewPrompts: [ 'Have I found ALL genuine issues, or might I have missed something?', 'Is every issue backed by specific, verifiable evidence?', 'Did I consider what happens when things go wrong (edge cases)?', 'Have I examined all risk dimensions, not just the obvious ones?', 'Are my severity ratings based on real production impact?', 'Did I document what I checked and found clean?' ], // ========================================================================= // LEGACY FIELDS (Preserved for Backward Compatibility) // Generated from Intent-Based fields above // ========================================================================= mustDo: [ // Auto-derived from successCriteria and constraints 'Find ALL code issues with specific evidence (file:line + code)', 'Classify severity based on actual production impact', 'Cover all risk dimensions: Security, Correctness, Reliability, Maintainability, Performance', 'Explicitly consider edge cases and failure modes', 'State negative assertions for clean areas', 'Address all items flagged by Critic' ], mustNotDo: [ // Auto-derived from constraints 'Raise issues without verifiable evidence', 'Re-raise already refuted issues with same logic', 'Suggest fixes (identification only)', 'Perform Critic role (refutation, challenge)', 'Skip edge case analysis', 'Ignore Critic-flagged items' ], focusAreas: [ // Auto-derived from focusIntents 'SECURITY: Injection, authentication, encryption, input validation', 'CORRECTNESS: Logic errors, edge cases, type safety', 'RELIABILITY: Error handling, resource management, concurrency', 'MAINTAINABILITY: Complexity, duplication, dependencies', 'PERFORMANCE: Algorithms, memory, I/O' ], outputRequirements: [ { field: 'issuesRaised', required: true, description: 'List of discovered issues (empty array if none)', validator: (v) => Array.isArray(v) }, { field: 'evidence', required: true, description: 'Code evidence for each issue', validator: (v) => typeof v === 'string' && v.length > 0 }, { field: 'categoryCoverage', required: false, description: 'List of reviewed categories' }, { field: 'edgeCaseCoverage', required: false, description: 'Edge case scenarios that were analyzed' } ], validationCriteria: [ { id: 'V001', description: 'Must include evidence when raising issues', severity: 'ERROR', check: checkVerifierHasEvidence }, { id: 'V002', description: 'Severity is appropriately classified', severity: 'WARNING', check: checkSeverityClassification }, { id: 'V003', description: 'Did not re-raise already refuted issues', severity: 'ERROR', check: checkNoRepeatedChallengedIssues }, { id: 'V004', description: 'Did not perform Critic actions (refutation, challenge)', severity: 'WARNING', check: checkNotActingAsCritic }, { id: 'V005', description: 'ALL 5 categories must be explicitly covered', severity: 'ERROR', check: checkAllCategoriesCovered }, { id: 'V006', description: 'Negative assertions required for clean areas', severity: 'WARNING', check: checkHasNegativeAssertions }, { id: 'V007', description: 'Edge case analysis must be documented', severity: 'ERROR', check: checkEdgeCaseCoverage }, { id: 'V008', description: 'Must address Critic-flagged items', severity: 'ERROR', check: checkCriticFlagsAddressed } ] }; // ============================================================================= // Critic Role Definition (Intent-Based Contract) // ============================================================================= export const CRITIC_ROLE: IntentBasedRoleDefinition = { name: 'critic', koreanName: '비평자', purpose: 'RIGOROUSLY verify issues and ensure NOTHING is missed', // ========================================================================= // SUCCESS CRITERIA - WHAT success looks like (Declarative Outcomes) // ========================================================================= successCriteria: [ { id: 'SC-C001', description: 'Every raised issue has been reviewed with a clear verdict', rationale: 'Unreviewed issues leave verification incomplete', required: true, validator: checkAllIssuesReviewed }, { id: 'SC-C002', description: 'False positives are identified and challenged with reasoning', rationale: 'Invalid issues waste developer effort if not caught', required: true }, { id: 'SC-C003', description: 'Severity assessments are validated against actual impact', rationale: 'Over/under-estimation leads to wrong prioritization', required: true }, { id: 'SC-C004', description: 'Verification coverage gaps are identified and flagged', rationale: 'Incomplete verification leaves blind spots', required: true, validator: checkVerifiedCategoryCoverage }, { id: 'SC-C005', description: 'Potential issues noticed are flagged for Verifier (not raised directly)', rationale: 'Maintains role separation while surfacing concerns', required: false } ], // ========================================================================= // CONSTRAINTS - Guardrails (Not Prescriptive Steps) // ========================================================================= constraints: [ { id: 'CON-C001', type: 'must', description: 'Every verdict must have concrete reasoning', rationale: 'Verdicts without reasoning are not actionable', severity: 'error', enabled: true }, { id: 'CON-C002', type: 'must', description: 'Evidence must be validated against actual code', rationale: 'Accepting unverified claims undermines the adversarial loop', severity: 'error', enabled: true }, { id: 'CON-C003', type: 'must-not', description: 'Do not directly raise new issues - use FLAG for Verifier', rationale: 'Role separation ensures adversarial verification', severity: 'error', enabled: true }, { id: 'CON-C004', type: 'must-not', description: 'Do not blindly accept or reject all issues', rationale: 'Blanket verdicts suggest lack of genuine review', severity: 'warning', enabled: true }, { id: 'CON-C005', type: 'must-not', description: 'Do not approve verification that skips categories or edge cases', rationale: 'Incomplete verification must be challenged', severity: 'error', enabled: true }, { id: 'CON-C006', type: 'must', description: 'Challenge happy-path-only verification', rationale: 'Edge case coverage is essential for production readiness', severity: 'warning', enabled: true } ], // ========================================================================= // FOCUS INTENTS - WHAT to think about (Semantic, not Procedural) // ========================================================================= focusIntents: [ { id: 'FI-FP', name: 'False Positive Detection', description: 'Identify issues that are not actually problems', examples: [ 'Intended behavior mistaken for bugs', 'Context-dependent code that is correct', 'Design decisions with valid trade-offs', 'Edge cases already handled elsewhere' ], priority: 'critical', enabled: true }, { id: 'FI-CTX', name: 'Context Understanding', description: 'Understand the code intent and design decisions', examples: [ 'Why was this approach chosen?', 'What constraints influenced the design?', 'Is this a known trade-off?', 'Does documentation explain the rationale?' ], priority: 'high', enabled: true }, { id: 'FI-SEV', name: 'Severity Validation', description: 'Evaluate whether severity ratings match actual impact', examples: [ 'Is CRITICAL warranted for this issue?', 'Could a LOW issue actually be HIGH in production?', 'Is exploitability realistic?', 'What is the actual blast radius?' ], priority: 'high', enabled: true }, { id: 'FI-EVD', name: 'Evidence Verification', description: 'Check if evidence actually supports the claimed issue', examples: [ 'Does the code snippet show what is claimed?', 'Is the line number accurate?', 'Is the interpretation correct?', 'Are there mitigating factors not mentioned?' ], priority: 'critical', enabled: true }, { id: 'FI-COV', name: 'Coverage Validation', description: 'Verify Verifier examined all risk dimensions', examples: [ 'Are all 5 categories addressed?', 'Were edge cases explicitly considered?', 'Are there obvious gaps in analysis?', 'Was the verification thorough or superficial?' ], priority: 'high', enabled: true } ], // ========================================================================= // SELF-REVIEW PROMPTS - Outcome-Focused Questions // ========================================================================= selfReviewPrompts: [ 'Have I reviewed every issue raised by Verifier?', 'Is my reasoning for each verdict concrete and verifiable?', 'Did I check the evidence against the actual code?', 'Have I identified any gaps in Verifier coverage?', 'Am I challenging shallow verification appropriately?', 'Did I flag potential issues I noticed (without raising directly)?' ], // ========================================================================= // LEGACY FIELDS (Preserved for Backward Compatibility) // ========================================================================= mustDo: [ 'Review every raised issue with clear verdict (VALID/INVALID/PARTIAL)', 'Provide concrete reasoning for each verdict', 'Validate evidence against actual code', 'Verify Verifier coverage across all risk dimensions', 'FLAG potential issues for Verifier review (do not raise directly)', 'Challenge incomplete or shallow verification' ], mustNotDo: [ 'Directly raise new issues (use FLAG instead)', 'Accept or reject issues without reasoning', 'Ignore presented evidence', 'Approve verification that skips categories', 'Let happy-path-only verification pass', 'Make emotional or subjective judgments' ], focusAreas: [ 'False positive detection: Verify if it is actually a problem', 'Context review: Understand code intent and design decisions', 'Severity verification: Evaluate actual impact and exploitability', 'Evidence verification: Check if presented evidence supports the issue', 'Coverage validation: Verify all risk dimensions were examined' ], outputRequirements: [ { field: 'issueReviews', required: true, description: 'Review results for each issue', validator: (v) => Array.isArray(v) }, { field: 'verdict', required: true, description: 'Verdict for each issue (VALID/INVALID/PARTIAL)', validator: (v) => ['VALID', 'INVALID', 'PARTIAL'].includes(v) }, { field: 'reasoning', required: true, description: 'Reasoning for the verdict', validator: (v) => typeof v === 'string' && v.length > 20 } ], validationCriteria: [ { id: 'C001', description: 'Reviewed all raised issues', severity: 'ERROR', check: checkAllIssuesReviewed }, { id: 'C002', description: 'Did not directly raise new issues (FLAG is allowed)', severity: 'ERROR', check: checkNoDirectNewIssuesFromCritic }, { id: 'C003', description: 'Refutation has reasoning', severity: 'WARNING', check: checkChallengeHasReasoning }, { id: 'C004', description: 'Did not blindly accept/refute all', severity: 'WARNING', check: checkNotBlindlyAgreeOrDisagree }, { id: 'C005', description: 'Did not perform Verifier actions (finding new issues)', severity: 'WARNING', check: checkNotActingAsVerifier }, { id: 'C006', description: 'Verified Verifier category coverage', severity: 'WARNING', check: checkVerifiedCategoryCoverage }, { id: 'C007', description: 'Challenged missing edge case coverage', severity: 'WARNING', check: checkChallengedShallowVerification } ] }; // ============================================================================= // Role Prompts // ============================================================================= export const VERIFIER_PROMPT: RolePrompt = { role: 'verifier', systemPrompt: `You are the **Verifier** in the Elenchus verification system. ## Your Role EXHAUSTIVELY find ALL code issues. Your goal is to ensure NO ISSUES ARE MISSED. After your verification, there should be NO NEW ISSUES discovered in subsequent verifications. ## REASONING PROCESS (Chain-of-Thought) Follow this structured thinking process for each file/function: **Step 1: Understand** - What is this code supposed to do? What are the inputs/outputs? **Step 2: Trace** - Follow the data flow. Where does input come from? Where does output go? **Step 3: Challenge** - What assumptions could be wrong? What could break this? **Step 4: Verify** - Check against each category. Document evidence. **Step 5: Self-Review** - Re-read your findings. Did you miss anything? Are severities accurate? ## CATEGORY COVERAGE (All 5 Required) ### SECURITY - Attack Surface Analysis - **Injection**: SQL, command, XSS, template injection - **Authentication**: Session handling, token validation, privilege escalation - **Data Exposure**: Sensitive data in logs, errors, responses - **Input Validation**: Type coercion, prototype pollution, path traversal - **Cryptography**: Weak algorithms, hardcoded secrets, improper randomness ### CORRECTNESS - Logic Verification - **Business Logic**: Does it match requirements? Edge cases handled? - **Type Safety**: Implicit conversions, null/undefined handling - **Async Behavior**: Promise rejection, race conditions, deadlocks - **State Management**: Mutation bugs, stale closures, memory leaks ### RELIABILITY - Failure Mode Analysis - **Error Handling**: Uncaught exceptions, error swallowing, retry logic - **Resource Management**: Connection leaks, file handle cleanup - **Timeout Handling**: Missing timeouts, cascading failures - **Graceful Degradation**: Partial failure handling ### MAINTAINABILITY - Code Quality - **Complexity**: Cyclomatic complexity > 10, deep nesting - **Duplication**: Copy-paste code, missed abstractions - **Coupling**: Tight dependencies, god objects - **Naming**: Misleading names, unclear intent ### PERFORMANCE - Efficiency Analysis - **Algorithm Complexity**: O(n²) in loops, unnecessary iterations - **Memory**: Large allocations, unbounded growth, memory leaks - **I/O**: N+1 queries, missing pagination, blocking calls - **Caching**: Missing cache, cache invalidation issues ## EDGE CASE THINKING (Mandatory) For each code section, systematically consider: | Category | Questions to Ask | |----------|-----------------| | **Inputs** | null? undefined? empty? boundary values? malformed? | | **State** | concurrent access? idempotent? partial state? | | **Dependencies** | timeout? error response? unavailable? | | **Users** | rapid clicks? multiple tabs? unexpected sequence? | | **Data** | legacy format? corrupted? exceeds limits? | ## EVIDENCE REQUIREMENTS Every issue MUST include: 1. **Location**: Exact file:line reference 2. **Code**: The actual problematic snippet 3. **Explanation**: WHY this is a problem (not just WHAT) 4. **Impact**: What could go wrong in production 5. **Severity Justification**: Why CRITICAL/HIGH/MEDIUM/LOW ## SELF-REVIEW CHECKLIST (Before Submitting) After completing your analysis, verify: - [ ] All 5 categories explicitly covered - [ ] Edge cases documented for each function - [ ] Every issue has code evidence - [ ] Severity matches actual impact - [ ] Clean areas have negative assertions - [ ] No duplicate issues - [ ] No issues without evidence`, outputTemplate: `## Reasoning Trace Brief summary of your analysis approach for this verification. ## Discovered Issues ### [SEC/COR/REL/MNT/PRF]-[NN]: [Summary] - **Category**: [SECURITY/CORRECTNESS/RELIABILITY/MAINTAINABILITY/PERFORMANCE] - **Severity**: [CRITICAL/HIGH/MEDIUM/LOW] - **Location**: [file:line] - **Impact**: [What could happen in production] - **Evidence**: \`\`\` [Actual code] \`\`\` - **Why This Matters**: [Explanation of the risk] ## Edge Case Analysis ### [Function/Component Name] | Scenario | Checked | Finding | |----------|---------|---------| | Null input | ✓ | [Issue ID or "Safe - validated at line X"] | | Empty array | ✓ | [Issue ID or "Safe - handled at line X"] | | Concurrent call | ✓ | [Issue ID or "Safe - no shared state"] | ## Category Coverage | Category | Areas Checked | Issues Found | Clean Areas | |----------|---------------|--------------|-------------| | SECURITY | [list] | [IDs] | [what's safe] | | CORRECTNESS | [list] | [IDs] | [what's safe] | | RELIABILITY | [list] | [IDs] | [what's safe] | | MAINTAINABILITY | [list] | [IDs] | [what's safe] | | PERFORMANCE | [list] | [IDs] | [what's safe] | ## Self-Review Confirmation - [x] All categories covered - [x] Edge cases documented - [x] Evidence provided for all issues`, exampleOutput: `## Reasoning Trace Analyzing authentication middleware. Following data flow from request → token extraction → validation → user context. Focusing on security and reliability. ## Discovered Issues ### SEC-01: JWT Secret Hardcoded in Source - **Category**: SECURITY - **Severity**: CRITICAL - **Location**: src/auth/jwt.ts:12 - **Impact**: Attacker with source access can forge any JWT token, gaining admin access - **Evidence**: \`\`\`typescript const JWT_SECRET = 'super-secret-key-123'; // HARDCODED const token = jwt.sign(payload, JWT_SECRET); \`\`\` - **Why This Matters**: Secrets in source code get committed to repos, leak via logs, and cannot be rotated without deployment. ### COR-02: Race Condition in Token Refresh - **Category**: CORRECTNESS - **Severity**: HIGH - **Location**: src/auth/refresh.ts:45-52 - **Impact**: Concurrent refresh requests can invalidate valid tokens, causing logout storms - **Evidence**: \`\`\`typescript async function refreshToken(oldToken: string) { const user = await validateToken(oldToken); // Check await invalidateToken(oldToken); // Invalidate return generateToken(user); // Generate new // No mutex: concurrent calls both pass validation, both invalidate } \`\`\` - **Why This Matters**: Users with multiple tabs open will randomly get logged out when any tab refreshes. ### REL-03: Missing Timeout on External Auth Provider - **Category**: RELIABILITY - **Severity**: MEDIUM - **Location**: src/auth/oauth.ts:78 - **Impact**: Slow OAuth provider response blocks entire auth flow, potentially causing request timeout cascade - **Evidence**: \`\`\`typescript const response = await fetch(oauthProviderUrl); // No timeout \`\`\` - **Why This Matters**: External service degradation will cascade to our service. Should have 5s timeout with fallback. ## Edge Case Analysis ### validateToken() | Scenario | Checked | Finding | |----------|---------|---------| | Null token | ✓ | Safe - throws 401 at line 23 | | Expired token | ✓ | Safe - jwt.verify handles | | Malformed JWT | ✓ | Safe - try/catch at line 20 | | Token from old secret | ✓ | Issue: fails silently (see SEC-01) | ### refreshToken() | Scenario | Checked | Finding | |----------|---------|---------| | Concurrent refresh | ✓ | COR-02 - Race condition | | Already invalidated | ✓ | Safe - returns 401 | | DB connection lost | ✓ | REL-03 - No retry logic | ## Category Coverage | Category | Areas Checked | Issues Found | Clean Areas | |----------|---------------|--------------|-------------| | SECURITY | JWT handling, secret storage, token validation | SEC-01 | CSRF protection OK, XSS headers present | | CORRECTNESS | Token lifecycle, refresh logic, validation | COR-02 | User lookup logic verified | | RELIABILITY | External calls, DB operations, error handling | REL-03 | Retry logic in DB layer OK | | MAINTAINABILITY | Code structure, naming | None | Well-structured, clear separation | | PERFORMANCE | Token operations, DB queries | None | Indexed lookups, no N+1 | ## Self-Review Confirmation - [x] All categories covered - [x] Edge cases documented - [x] Evidence provided for all issues`, checklist: [ '□ Did I follow the 5-step reasoning process?', '□ Did I cover ALL 5 categories with explicit findings?', '□ Did I check edge cases systematically (not just happy path)?', '□ Does every issue have file:line + code evidence?', '□ Did I explain WHY each issue matters (impact)?', '□ Did I state what was checked and found clean?', '□ Did I self-review before submitting?' ] };;; export const CRITIC_PROMPT: RolePrompt = { role: 'critic', systemPrompt: `You are the **Critic** in the Elenchus verification system. ## Your Role RIGOROUSLY verify issues and ensure Verifier missed NOTHING. You are the last line of defense against incomplete verification. ## CRITICAL REQUIREMENTS (Must Do) ### 1. VERIFY EXHAUSTIVE COVERAGE Check that Verifier covered ALL 5 categories: - If any category is missing, FLAG IT - If edge cases weren't checked, DEMAND THEM ### 2. REVIEW ALL ISSUES For each issue provide: - Verdict: VALID / INVALID / PARTIAL - Reasoning: Concrete evidence-based argument - Severity check: Is it over/under-estimated? ### 3. FLAG POTENTIAL ISSUES (Important!) If you notice something Verifier might have missed: - Do NOT directly raise it as an issue - Use FLAG format: "⚠️ FLAG FOR VERIFIER: [description]" - Verifier must address all FLAGs in next round ### 4. CHALLENGE SHALLOW VERIFICATION - Reject happy-path-only verification - Demand edge case coverage documentation - Question missing negative assertions ## Verdict Criteria - VALID: Issue exists, evidence is correct, severity is appropriate - INVALID: False positive, intended behavior, or misunderstanding - PARTIAL: Issue exists but severity/description needs adjustment ## Must Not Do - Directly raise new issues (use FLAG instead) - Accept incomplete category coverage - Approve verification without edge case analysis - Blindly accept or reject all issues`, outputTemplate: `## Critique Results ### Issue Review #### [Issue ID] Review - **Verdict**: [VALID/INVALID/PARTIAL] - **Reasoning**: [Reason for verdict] - **Recommended Action**: [Fix/Ignore/Re-review] ### Summary - Valid issues: N - Invalid issues: N - Partially valid: N`, exampleOutput: `## Critique Results ### Issue Review #### SEC-01 Review - **Verdict**: VALID - **Reasoning**: The presented evidence is clear, and user input is indeed inserted into the query without validation. The code writes queries directly without using the framework's ORM, confirming the vulnerability. - **Recommended Action**: Immediate fix required. Use parameterized queries. #### COR-02 Review - **Verdict**: INVALID - **Reasoning**: This code intentionally returns null. The function signature explicitly states | null, and all calling code performs null checks. - **Recommended Action**: Remove issue ### Summary - Valid issues: 1 - Invalid issues: 1 - Partially valid: 0`, checklist: [ '□ Reviewed all raised issues?', '□ Each verdict has specific reasoning?', '□ Did not raise new issues?', '□ Considered code context and intent?', '□ Did not blindly accept/refute all?' ] }; // ============================================================================= // Validation Functions // ============================================================================= function checkVerifierHasEvidence(output: string, _context: RoleContext): ValidationResult { // Issue pattern: SEC-XX, COR-XX, etc. const issuePattern = /(SEC|COR|REL|MNT|PRF)-\d+/g; const issues = output.match(issuePattern) || []; // Evidence patterns: code blocks, file:line const evidencePatterns = [ /```[\s\S]*?```/g, // Code blocks /\w+\.\w+:\d+/g, // file:line /evidence|증거|코드/gi // Evidence keywords ]; const hasEvidence = evidencePatterns.some(p => p.test(output)); if (issues.length > 0 && !hasEvidence) { return { passed: false, message: 'Issues raised but no code evidence provided', details: [`Raised issues: ${issues.join(', ')}`] }; } return { passed: true, message: 'Evidence requirement met' }; } function checkSeverityClassification(output: string, _context: RoleContext): ValidationResult { const severities = ['CRITICAL', 'HIGH', 'MEDIUM', 'LOW']; const foundSeverities = severities.filter(s => output.includes(s)); if (foundSeverities.length === 0 && output.match(/(SEC|COR|REL|MNT|PRF)-\d+/)) { return { passed: false, message: 'Issue severity not specified', details: ['Must specify one of CRITICAL/HIGH/MEDIUM/LOW'] }; } return { passed: true, message: 'Severity classification complete' }; } function checkNoRepeatedChallengedIssues(output: string, context: RoleContext): ValidationResult { // Issues previously judged as INVALID const challengedIssues = context.existingIssues .filter(i => i.status === 'CHALLENGED' || i.challengedBy === 'critic') .map(i => i.id); const currentIssues = output.match(/(SEC|COR|REL|MNT|PRF)-\d+/g) || []; const repeated = currentIssues.filter(i => challengedIssues.includes(i)); if (repeated.length > 0) { return { passed: false, message: 'Re-raised already refuted issues', details: repeated.map(i => `${i}: Refuted by Critic in previous round`) }; } return { passed: true, message: 'No repeated issues' }; } function checkNotActingAsCritic(output: string, _context: RoleContext): ValidationResult { // Critic action keywords const criticKeywords = [ /이\s*이슈는\s*(무효|오탐|false positive)/gi, /반박/gi, /동의하지\s*않/gi, /INVALID/g, /오탐입니다/gi ]; const found = criticKeywords.filter(p => p.test(output)); if (found.length > 0) { return { passed: false, message: 'Verifier performed Critic role (refutation)', details: ['Verifier should only discover issues. Refutation is the Critic\'s role.'] }; } return { passed: true, message: 'Role compliance met' }; } // [REMOVED: checkCategoryExamined - replaced by checkAllCategoriesCovered] function checkAllIssuesReviewed(output: string, context: RoleContext): ValidationResult { // Issues from the last Verifier round const lastVerifierRound = context.previousRounds .filter(r => r.role === 'verifier') .pop(); if (!lastVerifierRound) { return { passed: true, message: 'No previous Verifier round' }; } const issuesToReview = lastVerifierRound.issuesRaised; const reviewedIssues = issuesToReview.filter(id => output.includes(id)); if (reviewedIssues.length < issuesToReview.length) { const missing = issuesToReview.filter(id => !reviewedIssues.includes(id)); return { passed: false, message: `${missing.length} issues not reviewed`, details: missing.map(id => `${id}: Review required`) }; } return { passed: true, message: 'All issues reviewed' }; } // [REMOVED: checkNoNewIssuesFromCritic - replaced by checkNoDirectNewIssuesFromCritic with FLAG support] function checkChallengeHasReasoning(output: string, _context: RoleContext): ValidationResult { // Check if INVALID verdict exists const hasInvalid = /INVALID/i.test(output); if (hasInvalid) { // Check for reasoning keywords const reasoningKeywords = [ /근거|이유|왜냐하면|때문에/gi, /reasoning|because|since/gi, /실제로|사실은|확인 결과/gi ]; const hasReasoning = reasoningKeywords.some(p => p.test(output)); if (!hasReasoning) { return { passed: false, message: 'INVALID verdict lacks reasoning', details: ['Must provide specific reasoning when refuting'] }; } } return { passed: true, message: 'Refutation reasoning met' }; } function checkNotBlindlyAgreeOrDisagree(output: string, _context: RoleContext): ValidationResult { const verdicts = output.match(/\b(VALID|INVALID|PARTIAL)\b/g) || []; if (verdicts.length < 2) { return { passed: true, message: 'Skipped due to insufficient verdicts' }; } const validCount = verdicts.filter(v => v === 'VALID').length; const invalidCount = verdicts.filter(v => v === 'INVALID').length; const total = verdicts.length; // If 90%+ are the same verdict, it's blind acceptance/rejection if (validCount / total > 0.9) { return { passed: false, message: 'Blindly accepted almost all issues', details: [ `VALID: ${validCount}/${total}`, 'Critic should review critically' ] }; } if (invalidCount / total > 0.9) { return { passed: false, message: 'Blindly rejected almost all issues', details: [ `INVALID: ${invalidCount}/${total}`, 'Critic should acknowledge valid issues' ] }; } return { passed: true, message: 'Balanced verdicts' }; } function checkNotActingAsVerifier(output: string, _context: RoleContext): ValidationResult { // Verifier action keywords const verifierKeywords = [ /새로\s*발견/gi, /추가\s*이슈/gi, /다음\s*취약점/gi, /검토\s*결과.*발견/gi ]; const found = verifierKeywords.filter(p => p.test(output)); if (found.length > 1) { return { passed: false, message: 'Critic performed Verifier role (finding new issues)', details: ['Critic should only review existing issues'] }; } return { passed: true, message: 'Role compliance met' }; } // ============================================================================= // [ENH: EXHAUST] New Exhaustive Verification Functions // ============================================================================= /** * [ENH: EXHAUST-01] Check ALL 5 categories are explicitly covered */ function checkAllCategoriesCovered(output: string, _context: RoleContext): ValidationResult { const categories = ['SECURITY', 'CORRECTNESS', 'RELIABILITY', 'MAINTAINABILITY', 'PERFORMANCE']; const found = categories.filter(c => output.toUpperCase().includes(c)); const missing = categories.filter(c => !output.toUpperCase().includes(c)); if (missing.length > 0) { return { passed: false, message: `Missing category coverage: ${missing.join(', ')}`, details: [ `Found: ${found.join(', ')}`, `Missing: ${missing.join(', ')}`, 'ALL 5 categories must be explicitly reviewed' ] }; } return { passed: true, message: 'All 5 categories covered' }; } /** * [ENH: EXHAUST-02] Check for negative assertions (checked, no issues) */ function checkHasNegativeAssertions(output: string, _context: RoleContext): ValidationResult { const negativePatterns = [ /이슈\s*(없|발견되지)/gi, /no\s*issues?\s*(found|detected)/gi, /검토\s*완료.*이상\s*없/gi, /clean|passed|verified.*no/gi, /확인.*문제\s*없/gi, /✓|✔|PASS|OK/g ]; const hasNegativeAssertion = negativePatterns.some(p => p.test(output)); // Check if there are issues raised - if so, negative assertions are less critical const hasIssues = /(SEC|COR|REL|MNT|PRF)-\d+/.test(output); if (!hasNegativeAssertion && !hasIssues) { return { passed: false, message: 'No negative assertions found - must state what was checked and found clean', details: [ 'Example: "SECURITY: Injection check - no issues found"', 'Must explicitly state clean areas, not just issues' ] }; } return { passed: true, message: 'Negative assertions present or issues found' }; } /** * [ENH: EXHAUST-03] Check for edge case coverage documentation */ function checkEdgeCaseCoverage(output: string, _context: RoleContext): ValidationResult { // ============================================================================= // INTENT-BASED EDGE CASE VALIDATION // Instead of hardcoded pattern matching, we: // 1. Check for explicit edge case section in output // 2. Trust the LLM to evaluate coverage based on semantic understanding // 3. Require structured self-reporting of what was considered // ============================================================================= // Check if verifier explicitly documented edge case analysis const hasEdgeCaseSection = /edge\s*case|엣지\s*케이스|경계\s*(조건|케이스)|boundary|corner\s*case/i.test(output); // Check for structured coverage reporting (the LLM should state what they checked) const hasCoverageReport = /checked|verified|analyzed|reviewed|확인|검토|분석/i.test(output) && ( /no\s*issues?|clean|pass|✓|✔|이상\s*없|문제\s*없/i.test(output) || /(SEC|COR|REL|MNT|PRF)-\d+/.test(output) ); // The key insight: instead of checking WHAT they wrote about, // check that they explicitly stated what edge cases they CONSIDERED // This trusts the LLM's semantic understanding while ensuring explicit documentation if (!hasEdgeCaseSection && !hasCoverageReport) { return { passed: false, message: 'Edge case analysis not documented', details: [ 'You must explicitly document your edge case analysis.', '', 'Include a section that considers scenarios beyond the happy path:', '', '## Edge Case Considerations', 'For each function/component, think about:', '- What happens with unexpected inputs? (null, empty, extreme values)', '- What happens when external dependencies fail?', '- What happens under concurrent access or race conditions?', '- What state mutations or side effects could cause issues?', '- What happens when business rules conflict or have exceptions?', '', 'Document what you considered and your findings (issues or clean).' ] }; } return { passed: true, message: 'Edge case analysis documented' }; } /** * [ENH: EXHAUST-04] Check if Critic flags were addressed */ function checkCriticFlagsAddressed(_output: string, context: RoleContext): ValidationResult { // Get previous Critic rounds const criticRounds = context.previousRounds.filter(r => r.role === 'critic'); if (criticRounds.length === 0) { return { passed: true, message: 'No previous Critic rounds' }; } // This is a soft check - specific flag tracking requires session-level state // The check verifies that Critic rounds exist (flags may have been raised) // Full flag tracking would require storing flags in session state return { passed: true, message: `Flag response check - ${criticRounds.length} Critic rounds exist` }; } /** * [ENH: EXHAUST-06] Check if Critic verified category coverage */ function checkVerifiedCategoryCoverage(output: string, _context: RoleContext): ValidationResult { const coverageCheckPatterns = [ /coverage.*verified|verified.*coverage/gi, /category.*check|check.*category/gi, /all\s*categories/gi, /5\s*categories|five\s*categories/gi, /카테고리.*확인|확인.*카테고리/gi ]; const hasCheck = coverageCheckPatterns.some(p => p.test(output)); // Also check if Critic flagged missing coverage const flaggedMissing = /missing|skipped|누락|빠진/.test(output) && /category|카테고리/.test(output); if (!hasCheck && !flaggedMissing) { return { passed: false, message: 'Did not verify Verifier category coverage', details: [ 'Critic must verify ALL 5 categories were checked', 'If any category was skipped, flag it' ] }; } return { passed: true, message: 'Category coverage verification done' }; } /** * [ENH: EXHAUST-07] Check if Critic challenged shallow verification */ function checkChallengedShallowVerification(output: string, _context: RoleContext): ValidationResult { // Check if output discusses edge cases const edgeCaseDiscussion = /edge\s*case|boundary|경계|엣지|null|empty/gi.test(output); // Check if Critic flagged missing edge case coverage const flaggedEdgeCase = /FLAG.*edge|edge.*FLAG|missing.*edge|edge.*missing/gi.test(output) || /FLAG.*경계|경계.*FLAG|누락.*엣지/gi.test(output); // Check for challenge patterns const challengePatterns = [ /happy\s*path.*only|only.*happy\s*path/gi, /shallow|superficial|피상적/gi, /not\s*thorough|불충분|부족/gi, /need.*more.*detail|more.*thorough/gi ]; const hasChallengeOrDiscussion = edgeCaseDiscussion || flaggedEdgeCase || challengePatterns.some(p => p.test(output)); // This is a soft check - if there's no indication either way, pass return { passed: true, message: hasChallengeOrDiscussion ? 'Edge case/shallow verification addressed' : 'No shallow verification detected (or not challenged)' }; } /** * [ENH: EXHAUST-02] Replace old checkNoNewIssuesFromCritic with FLAG-aware version */ function checkNoDirectNewIssuesFromCritic(output: string, context: RoleContext): ValidationResult { // Check for FLAG format (allowed) const flagPattern = /⚠️\s*FLAG\s*(FOR\s*VERIFIER)?:/gi; const hasFlags = flagPattern.test(output); // Check for direct issue raising (not allowed) const directIssuePatterns = [ /새로운\s*(이슈|문제|취약점)(?!.*FLAG)/gi, /추가로\s*발견(?!.*FLAG)/gi, /new\s*issue(?!.*FLAG)/gi ]; // Check if new issue IDs are mentioned (not as flags) const existingIds = context.existingIssues.map(i => i.id); const mentionedIds = output.match(/(SEC|COR|REL|MNT|PRF)-\d+/g) || []; const newIds = mentionedIds.filter(id => !existingIds.includes(id)); // If new IDs exist, check if they're in FLAG format const flaggedContent = output.match(/⚠️\s*FLAG[^]*?(?=⚠️|$)/gi) || []; const idsInFlagsMatch = flaggedContent.join(' ').match(/(SEC|COR|REL|MNT|PRF)-\d+/g); const idsInFlags: string[] = idsInFlagsMatch ? Array.from(idsInFlagsMatch) : []; const directNewIds = newIds.filter(id => !idsInFlags.includes(id)); const hasDirectNewIssues = directIssuePatterns.some(p => p.test(output)) || directNewIds.length > 0; if (hasDirectNewIssues) { return { passed: false, message: 'Critic directly raised new issues (must use FLAG format)', details: [ 'Critic should not directly raise issues.', 'Use FLAG format: "⚠️ FLAG FOR VERIFIER: [description]"', ...(directNewIds.length > 0 ? [`Direct new IDs: ${directNewIds.join(', ')}`] : []) ] }; } if (hasFlags) { return { passed: true, message: 'Flags used correctly (not direct issues)' }; } return { passed: true, message: 'No new issues raised' }; } // ============================================================================= // Exports // ============================================================================= export const ROLE_DEFINITIONS: Record<string, RoleDefinition> = { verifier: VERIFIER_ROLE, critic: CRITIC_ROLE }; export const ROLE_PROMPTS: Record<string, RolePrompt> = { verifier: VERIFIER_PROMPT, critic: CRITIC_PROMPT };

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/jhlee0409/elenchus-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

definitions.ts•46.2 KiB