Salesforce Documentation MCP Server

test-llm-judge.ts•15.1 KiB

/**
 * LLM-as-Judge Test Suite (Phase 3.4)
 *
 * Weekly semantic evaluation using LLM to assess search result quality.
 * This catches semantic drift that keyword matching cannot detect.
 *
 * Run: npm run test-llm-judge
 *
 * Requirements:
 * - OPENAI_API_KEY or AZURE_OPENAI_API_KEY environment variable
 * - Or configure a local LLM endpoint
 *
 * How it works:
 * 1. Runs a subset of critical queries
 * 2. Sends query + top result to LLM
 * 3. LLM scores relevance on 1-5 scale with reasoning
 * 4. Aggregates scores and flags queries below threshold
 */

import { searchDocuments } from "../src/db/queries.js";
import { initializeDatabase, isDatabaseIndexed } from "../src/db/database.js";
import * as fs from "fs";
import * as path from "path";

// Configuration
const LLM_JUDGE_CONFIG = {
  // Minimum acceptable relevance score from LLM (1-5 scale)
  minRelevanceScore: 3.5,

  // Minimum fraction of queries that must pass for the run to succeed
  minPassRate: 0.8,

  // Model configuration (update based on your setup)
  model: process.env.LLM_JUDGE_MODEL || "gpt-4o-mini",
  apiEndpoint: process.env.LLM_JUDGE_ENDPOINT || "https://api.openai.com/v1/chat/completions",

  // Results file
  resultsFile: path.join(process.cwd(), 'data', 'llm-judge-results.json'),

  // Rate limiting
  delayBetweenCalls: 1000, // ms
};

// Critical test cases for LLM evaluation
// These are queries where semantic understanding matters most
const LLM_JUDGE_CASES = [
  {
    query: "How do I prevent my trigger from running multiple times?",
    context: "Developer asking about trigger recursion prevention",
    expectedTopics: ["static variable", "recursion", "trigger handler pattern"]
  },
  {
    query: "What's the difference between before and after triggers?",
    context: "Developer learning Apex triggers",
    expectedTopics: ["before trigger", "after trigger", "DML operations", "timing"]
  },
  {
    query: "System.LimitException: Too many SOQL queries: 101",
    context: "Developer debugging a governor limit error",
    expectedTopics: ["governor limits", "SOQL", "bulkification", "query optimization"]
  },
  {
    query: "how do I make an API call from Apex",
    context: "Developer needs to call external service",
    expectedTopics: ["HTTP", "callout", "HttpRequest", "named credential"]
  },
  {
    query: "LWC component not showing on record page",
    context: "Developer troubleshooting component visibility",
    expectedTopics: ["target config", "meta xml", "record page", "component visibility"]
  },
  {
    query: "best way to handle errors in Lightning Web Components",
    context: "Developer implementing error handling",
    expectedTopics: ["try-catch", "error handling", "toast", "UI feedback"]
  },
  {
    query: "what's the heap size limit in Apex",
    context: "Developer optimizing memory usage",
    expectedTopics: ["heap", "6MB", "12MB", "async", "governor limits"]
  },
  {
    query: "how to deploy metadata to production",
    context: "Developer/Admin deploying changes",
    expectedTopics: ["deploy", "change set", "SFDX", "metadata API", "package"]
  },
  {
    query: "OAuth 2.0 JWT bearer flow for Salesforce",
    context: "Developer implementing server-to-server auth",
    expectedTopics: ["JWT", "connected app", "certificate", "OAuth"]
  },
  {
    query: "platform events vs change data capture",
    context: "Architect choosing event architecture",
    expectedTopics: ["platform events", "CDC", "streaming", "event-driven"]
  }
];

/** One evaluated query as persisted to the results file. */
interface LLMJudgeResult {
  query: string;
  topResultTitle: string;
  topResultSnippet: string;
  /** 0 means "not scored" (no search result, judge failure, or error). */
  llmScore: number;
  llmReasoning: string;
  passed: boolean;
  timestamp: string;
}

/** Verdict returned by the judge (real or simulated). */
interface LLMResponse {
  score: number;
  reasoning: string;
}

/**
 * Builds the evaluation prompt sent to the judge model.
 *
 * @param query - The user query that was searched.
 * @param context - Short description of who is asking and why.
 * @param resultTitle - Title of the top search result.
 * @param resultSnippet - Text of the top result; only the first 1000 characters are included.
 * @param expectedTopics - Topics a good result is expected to cover (reference for the judge).
 * @returns The complete prompt text.
 */
function buildJudgePrompt(query: string, context: string, resultTitle: string, resultSnippet: string, expectedTopics: string[]): string {
  return `You are evaluating a search engine for Salesforce documentation. Your task is to score how relevant the search result is to the user's query.

USER QUERY: "${query}"
CONTEXT: ${context}

SEARCH RESULT:
Title: ${resultTitle}
Content: ${resultSnippet.substring(0, 1000)}...

EXPECTED TOPICS (for reference): ${expectedTopics.join(", ")}

SCORING CRITERIA:
5 - Perfect: Directly answers the query with authoritative, comprehensive information
4 - Good: Relevant information that helps answer the query
3 - Acceptable: Somewhat relevant but missing key information or not the best match
2 - Poor: Tangentially related but doesn't really help
1 - Irrelevant: Completely unrelated to the query

Please respond in JSON format:
{
  "score": <number 1-5>,
  "reasoning": "<brief explanation of your score>"
}`;
}

/**
 * Extracts and validates the judge verdict from raw model output.
 *
 * The model is asked for JSON but may wrap it in markdown fences or return
 * malformed values, so the payload is treated as untrusted: the score must be
 * a finite number in the 1-5 range (numeric strings are coerced).
 *
 * @param content - Raw text content of the model's reply.
 * @returns The validated verdict, or `null` when no usable verdict is present.
 */
function parseLLMResponse(content: string): LLMResponse | null {
  // Take the outermost {...} span so surrounding markdown code fences are ignored.
  const jsonMatch = content.match(/\{[\s\S]*\}/);
  if (!jsonMatch) {
    console.error("Could not parse JSON from LLM response:", content);
    return null;
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(jsonMatch[0]);
  } catch {
    console.error("Could not parse JSON from LLM response:", content);
    return null;
  }

  if (typeof parsed !== "object" || parsed === null) {
    console.error("Could not parse JSON from LLM response:", content);
    return null;
  }

  const { score: rawScore, reasoning } = parsed as Record<string, unknown>;
  const score = typeof rawScore === "string" ? Number(rawScore) : rawScore;
  if (typeof score !== "number" || !Number.isFinite(score) || score < 1 || score > 5) {
    console.error("LLM response has an invalid score:", content);
    return null;
  }

  return { score, reasoning: typeof reasoning === "string" ? reasoning : "" };
}

/**
 * Sends the prompt to the configured chat-completions endpoint.
 *
 * @param prompt - Prompt produced by `buildJudgePrompt`.
 * @returns The validated verdict, or `null` when no API key is set, the HTTP
 *   call fails, or the reply cannot be parsed/validated. Never throws.
 */
async function callLLMJudge(prompt: string): Promise<LLMResponse | null> {
  const apiKey = process.env.OPENAI_API_KEY || process.env.AZURE_OPENAI_API_KEY;

  if (!apiKey) {
    console.warn("⚠️ No LLM API key found. Set OPENAI_API_KEY or AZURE_OPENAI_API_KEY");
    return null;
  }

  try {
    const response = await fetch(LLM_JUDGE_CONFIG.apiEndpoint, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Authorization': `Bearer ${apiKey}`
      },
      body: JSON.stringify({
        model: LLM_JUDGE_CONFIG.model,
        messages: [
          {
            role: "system",
            content: "You are a search quality evaluator. Always respond with valid JSON."
          },
          {
            role: "user",
            content: prompt
          }
        ],
        temperature: 0.1,
        max_tokens: 500
      })
    });

    if (!response.ok) {
      console.error(`LLM API error: ${response.status} ${response.statusText}`);
      return null;
    }

    const data = (await response.json()) as {
      choices?: Array<{ message?: { content?: string } }>;
    };
    const content = data.choices?.[0]?.message?.content;

    if (!content) {
      console.error("No content in LLM response");
      return null;
    }

    return parseLLMResponse(content);
  } catch (error) {
    console.error("Error calling LLM:", error);
    return null;
  }
}

/**
 * Heuristic stand-in for the LLM judge, used when no API key is configured.
 *
 * Scores by the fraction of expected topics that appear (case-insensitively)
 * in the result title + snippet, with a small random jitter, so scores are
 * not reproducible between runs.
 *
 * @param query - The user query (unused by the heuristic; kept for signature parity).
 * @param resultTitle - Title of the top search result.
 * @param resultSnippet - Text of the top search result.
 * @param expectedTopics - Topics to look for in the result.
 * @returns A verdict with a score rounded to one decimal place, capped at 5.
 */
function simulateLLMJudge(query: string, resultTitle: string, resultSnippet: string, expectedTopics: string[]): LLMResponse {
  // Simple heuristic: check how many expected topics appear in the result
  const content = (resultTitle + " " + resultSnippet).toLowerCase();
  const topicMatches = expectedTopics.filter(topic => content.includes(topic.toLowerCase())).length;

  // An empty topic list counts as "no match" rather than producing 0/0 = NaN.
  const matchRatio = expectedTopics.length > 0 ? topicMatches / expectedTopics.length : 0;

  let score: number;
  let reasoning: string;

  if (matchRatio >= 0.6) {
    score = 4 + Math.random() * 0.5;
    reasoning = `Good match: ${topicMatches}/${expectedTopics.length} expected topics found in result`;
  } else if (matchRatio >= 0.3) {
    score = 3 + Math.random() * 0.5;
    reasoning = `Partial match: ${topicMatches}/${expectedTopics.length} expected topics found`;
  } else if (matchRatio > 0) {
    score = 2 + Math.random() * 0.5;
    reasoning = `Weak match: Only ${topicMatches}/${expectedTopics.length} expected topics found`;
  } else {
    score = 1 + Math.random() * 0.5;
    reasoning = `Poor match: No expected topics found in result`;
  }

  return {
    score: Math.min(5, Math.round(score * 10) / 10),
    reasoning
  };
}

/**
 * Builds the record stored for a query that could not be scored.
 *
 * @param query - The query that was evaluated.
 * @param topResultTitle - Title of the top result, or a sentinel such as "NO RESULTS".
 * @param topResultSnippet - Snippet of the top result (may be empty).
 * @param llmReasoning - Human-readable reason the query was not scored.
 * @returns A failed result with `llmScore` 0.
 */
function buildFailedResult(query: string, topResultTitle: string, topResultSnippet: string, llmReasoning: string): LLMJudgeResult {
  return {
    query,
    topResultTitle,
    topResultSnippet,
    llmScore: 0,
    llmReasoning,
    passed: false,
    timestamp: new Date().toISOString()
  };
}

/**
 * Runs every judge case, prints a summary, and saves results to disk.
 *
 * Exits the process with code 1 when the database is not indexed or when the
 * pass rate is below `LLM_JUDGE_CONFIG.minPassRate`.
 */
async function runLLMJudge(): Promise<void> {
  console.log("=".repeat(80));
  console.log("🧠 LLM-as-Judge Search Quality Evaluation (Phase 3.4)");
  console.log("=".repeat(80));

  // Initialize database
  await initializeDatabase();

  if (!isDatabaseIndexed()) {
    console.error("\nError: Database is not indexed!");
    process.exit(1);
  }

  const apiKey = process.env.OPENAI_API_KEY || process.env.AZURE_OPENAI_API_KEY;
  const useSimulation = !apiKey;

  if (useSimulation) {
    console.log("\n⚠️ No API key found. Using simulated LLM judge (heuristic-based).");
    console.log(" Set OPENAI_API_KEY or AZURE_OPENAI_API_KEY for real LLM evaluation.\n");
  } else {
    console.log(`\n🤖 Using model: ${LLM_JUDGE_CONFIG.model}`);
    console.log(` Endpoint: ${LLM_JUDGE_CONFIG.apiEndpoint}\n`);
  }

  const results: LLMJudgeResult[] = [];
  let totalScore = 0;
  let scoredCount = 0; // queries that actually received a judge score
  let passedCount = 0;

  for (const testCase of LLM_JUDGE_CASES) {
    console.log(`\n📋 Query: "${testCase.query}"`);
    console.log(` Context: ${testCase.context}`);
    console.log("-".repeat(60));

    try {
      // Search for results
      const searchResults = await searchDocuments(testCase.query, { maxResults: 1 });

      if (searchResults.length === 0) {
        console.log(" ❌ No search results returned");
        results.push(buildFailedResult(testCase.query, "NO RESULTS", "", "No search results returned"));
        continue;
      }

      const topResult = searchResults[0];
      const resultTitle = topResult.document.title;
      const resultSnippet = topResult.chunk;

      console.log(` Top result: ${resultTitle}`);
      console.log(` Score: ${topResult.score.toFixed(2)}`);

      // Get LLM judgment
      let llmResponse: LLMResponse | null;

      if (useSimulation) {
        llmResponse = simulateLLMJudge(
          testCase.query,
          resultTitle,
          resultSnippet,
          testCase.expectedTopics
        );
      } else {
        const prompt = buildJudgePrompt(
          testCase.query,
          testCase.context,
          resultTitle,
          resultSnippet,
          testCase.expectedTopics
        );
        llmResponse = await callLLMJudge(prompt);

        // Rate limiting
        await new Promise(resolve => setTimeout(resolve, LLM_JUDGE_CONFIG.delayBetweenCalls));
      }

      if (llmResponse) {
        const passed = llmResponse.score >= LLM_JUDGE_CONFIG.minRelevanceScore;
        totalScore += llmResponse.score;
        scoredCount++;
        if (passed) passedCount++;

        const statusIcon = passed ? '✅' : '❌';
        console.log(` ${statusIcon} LLM Score: ${llmResponse.score}/5`);
        console.log(` Reasoning: ${llmResponse.reasoning}`);

        results.push({
          query: testCase.query,
          topResultTitle: resultTitle,
          topResultSnippet: resultSnippet.substring(0, 500),
          llmScore: llmResponse.score,
          llmReasoning: llmResponse.reasoning,
          passed,
          timestamp: new Date().toISOString()
        });
      } else {
        console.log(" ⚠️ Could not get LLM judgment");
        results.push(buildFailedResult(testCase.query, resultTitle, resultSnippet.substring(0, 500), "LLM evaluation failed"));
      }
    } catch (error) {
      console.log(` ❌ Error: ${error}`);
      results.push(buildFailedResult(testCase.query, "ERROR", "", String(error)));
    }
  }

  // Summary
  console.log("\n\n" + "=".repeat(80));
  console.log("📊 LLM JUDGE SUMMARY");
  console.log("=".repeat(80));

  // Average only over queries that were scored; guard the all-failed case,
  // which would otherwise be 0/0 = NaN (printed as "NaN" and saved as null).
  const avgScore = scoredCount > 0 ? totalScore / scoredCount : 0;
  const passRate = results.length > 0 ? passedCount / results.length : 0;

  console.log(`\nTotal Queries Evaluated: ${results.length}`);
  console.log(`Average LLM Score: ${avgScore.toFixed(2)}/5`);
  console.log(`Pass Rate (≥${LLM_JUDGE_CONFIG.minRelevanceScore}): ${passedCount}/${results.length} (${(passRate * 100).toFixed(1)}%)`);

  // Failed queries
  const failedResults = results.filter(r => !r.passed);
  if (failedResults.length > 0) {
    console.log("\n❌ Queries needing attention:");
    for (const result of failedResults) {
      console.log(` - "${result.query}"`);
      console.log(` Score: ${result.llmScore}/5 - ${result.llmReasoning}`);
    }
  }

  // Save results (best effort: a write failure is reported but does not fail the run)
  try {
    const dir = path.dirname(LLM_JUDGE_CONFIG.resultsFile);
    if (!fs.existsSync(dir)) {
      fs.mkdirSync(dir, { recursive: true });
    }

    const historicalResults = {
      runTimestamp: new Date().toISOString(),
      mode: useSimulation ? 'simulated' : 'live',
      model: useSimulation ? 'heuristic' : LLM_JUDGE_CONFIG.model,
      summary: {
        totalQueries: results.length,
        avgScore,
        passRate,
        passedCount,
        failedCount: failedResults.length
      },
      results
    };

    fs.writeFileSync(LLM_JUDGE_CONFIG.resultsFile, JSON.stringify(historicalResults, null, 2));
    console.log(`\n💾 Results saved to ${LLM_JUDGE_CONFIG.resultsFile}`);
  } catch (error) {
    console.error(`\n❌ Could not save results: ${error}`);
  }

  console.log("\n" + "=".repeat(80));
  if (passRate >= LLM_JUDGE_CONFIG.minPassRate) {
    console.log("✅ Search quality meets LLM judge standards!");
  } else {
    console.log("⚠️ Search quality needs improvement based on LLM evaluation");
    process.exit(1);
  }
  console.log("=".repeat(80));
}

// An unexpected rejection must fail the run; logging alone would leave exit code 0.
runLLMJudge().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/SalesforceDiariesBySanket/salesforce-docs-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

test-llm-judge.ts•15.1 KiB