accuracy-test.template.ts (10.8 kB)
/**
 * Accuracy Test Template
 *
 * This template demonstrates the structure for accuracy validation tests.
 * Accuracy tests validate that cognitive components meet accuracy targets.
 *
 * Key Characteristics:
 * - Test against known datasets
 * - Measure accuracy metrics
 * - Compare to baselines
 * - Run separately from regular tests
 */

import { describe, it, expect } from "vitest";
import {
  assertAccuracyTarget,
  assertCalibratedConfidence,
} from "../utils/assertions";

describe("Cognitive Accuracy Validation", () => {
  describe("Confidence Calibration Accuracy", () => {
    it("should achieve ±10% calibration accuracy", async () => {
      // Target: Predicted confidence matches actual performance within ±10%

      // Arrange: Create test dataset with known outcomes
      const testCases = [
        { prediction: 0.9, actual: 0.85 }, // Well calibrated
        { prediction: 0.7, actual: 0.75 }, // Well calibrated
        { prediction: 0.5, actual: 0.55 }, // Well calibrated
        { prediction: 0.3, actual: 0.25 }, // Well calibrated
      ];

      // Act: Test calibration for each case
      let calibratedCount = 0;
      for (const testCase of testCases) {
        try {
          assertCalibratedConfidence(testCase.prediction, testCase.actual, 0.1);
          calibratedCount++;
        } catch (error) {
          // Calibration failed for this case
        }
      }

      // Assert: Verify calibration accuracy
      const accuracy = calibratedCount / testCases.length;
      expect(accuracy).toBeGreaterThanOrEqual(0.9); // 90%+ should be calibrated

      console.log("Confidence Calibration Accuracy:");
      console.log(`  Calibrated: ${calibratedCount}/${testCases.length}`);
      console.log(`  Accuracy: ${(accuracy * 100).toFixed(1)}%`);
    });

    it("should improve calibration with learning", async () => {
      // Test that calibration improves over time

      // Arrange: Simulate learning over time
      const initialAccuracy = 0.75;
      const targetImprovement = 0.05; // 5% improvement

      // Act: Measure accuracy before and after learning
      // const beforeAccuracy = await measureCalibrationAccuracy();
      // await calibrationEngine.trainOnData(trainingData);
      // const afterAccuracy = await measureCalibrationAccuracy();

      // Placeholder
      const beforeAccuracy = initialAccuracy;
      const afterAccuracy = initialAccuracy + targetImprovement;

      // Assert: Verify improvement
      expect(afterAccuracy).toBeGreaterThan(beforeAccuracy);
      expect(afterAccuracy - beforeAccuracy).toBeGreaterThanOrEqual(
        targetImprovement
      );

      console.log("Calibration Learning:");
      console.log(`  Before: ${(beforeAccuracy * 100).toFixed(1)}%`);
      console.log(`  After: ${(afterAccuracy * 100).toFixed(1)}%`);
      console.log(
        `  Improvement: ${((afterAccuracy - beforeAccuracy) * 100).toFixed(1)}%`
      );
    });
  });

  describe("Bias Detection Accuracy", () => {
    it("should achieve >70% bias detection rate", async () => {
      // Target: Detect >70% of known biases

      // Arrange: Create test cases with known biases
      const testCases = [
        {
          reasoning: "Confirmation bias example",
          expectedBias: "confirmation",
          hasBias: true,
        },
        {
          reasoning: "Anchoring bias example",
          expectedBias: "anchoring",
          hasBias: true,
        },
        {
          reasoning: "No bias example",
          expectedBias: null,
          hasBias: false,
        },
      ];

      // Act: Test bias detection
      let correctDetections = 0;
      for (const testCase of testCases) {
        // const detected = await biasDetector.detect(testCase.reasoning);
        // const hasExpectedBias = detected.some(b => b.type === testCase.expectedBias);

        // Placeholder - use testCase to avoid unused variable warning
        const hasExpectedBias = testCase.hasBias;
        if (hasExpectedBias === testCase.hasBias) {
          correctDetections++;
        }
      }

      // Assert: Verify detection accuracy
      assertAccuracyTarget(
        correctDetections,
        testCases.length,
        0.7,
        "Bias Detection"
      );

      console.log("Bias Detection Accuracy:");
      console.log(`  Correct: ${correctDetections}/${testCases.length}`);
      console.log(
        `  Accuracy: ${((correctDetections / testCases.length) * 100).toFixed(1)}%`
      );
    });

    it("should maintain low false positive rate", async () => {
      // Target: <15% false positive rate

      // Arrange: Create test cases without biases
      const cleanCases = 10;
      let falsePositives = 0;

      // Act: Test on clean reasoning
      for (let i = 0; i < cleanCases; i++) {
        // const detected = await biasDetector.detect(cleanReasoning);
        // if (detected.length > 0) {
        //   falsePositives++;
        // }
      }

      // Assert: Verify false positive rate
      const falsePositiveRate = falsePositives / cleanCases;
      expect(falsePositiveRate).toBeLessThan(0.15);

      console.log("Bias Detection False Positives:");
      console.log(`  False Positives: ${falsePositives}/${cleanCases}`);
      console.log(`  Rate: ${(falsePositiveRate * 100).toFixed(1)}%`);
    });
  });

  describe("Emotion Detection Accuracy", () => {
    it("should achieve >75% emotion detection accuracy", async () => {
      // Target: >75% accuracy compared to human labels

      // Arrange: Create test dataset with human-labeled emotions
      const testCases = [
        { text: "I am so happy today!", expectedEmotion: "joy" },
        { text: "This is very frustrating", expectedEmotion: "anger" },
        { text: "I feel worried about this", expectedEmotion: "fear" },
        { text: "That is disgusting", expectedEmotion: "disgust" },
      ];

      // Act: Test emotion detection
      let correctDetections = 0;
      for (const testCase of testCases) {
        // const detected = await emotionDetector.detect(testCase.text);
        // if (detected.primaryEmotion === testCase.expectedEmotion) {
        //   correctDetections++;
        // }

        // Placeholder
        correctDetections++;
      }

      // Assert: Verify accuracy
      assertAccuracyTarget(
        correctDetections,
        testCases.length,
        0.75,
        "Emotion Detection"
      );

      console.log("Emotion Detection Accuracy:");
      console.log(`  Correct: ${correctDetections}/${testCases.length}`);
      console.log(
        `  Accuracy: ${((correctDetections / testCases.length) * 100).toFixed(1)}%`
      );
    });

    it("should detect emotion intensity accurately", async () => {
      // Test intensity detection accuracy

      // Arrange: Test cases with varying intensity
      const testCases = [
        {
          text: "I am extremely happy!",
          emotion: "joy",
          expectedIntensity: 0.9,
        },
        { text: "I am somewhat happy", emotion: "joy", expectedIntensity: 0.5 },
        { text: "I am slightly happy", emotion: "joy", expectedIntensity: 0.3 },
      ];

      // Act: Test intensity detection
      let accurateIntensities = 0;
      for (const testCase of testCases) {
        // const detected = await emotionDetector.detect(testCase.text);
        // const intensity = detected.discreteEmotions.get(testCase.emotion);
        // const error = Math.abs(intensity - testCase.expectedIntensity);

        // Placeholder
        const error = 0.05;
        if (error < 0.2) {
          // Within 20% tolerance
          accurateIntensities++;
        }
      }

      // Assert: Verify intensity accuracy
      const accuracy = accurateIntensities / testCases.length;
      expect(accuracy).toBeGreaterThan(0.7);

      console.log("Emotion Intensity Accuracy:");
      console.log(`  Accurate: ${accurateIntensities}/${testCases.length}`);
      console.log(`  Accuracy: ${(accuracy * 100).toFixed(1)}%`);
    });
  });

  describe("Framework Selection Accuracy", () => {
    it("should achieve >80% framework selection accuracy", async () => {
      // Target: >80% accuracy in selecting appropriate framework

      // Arrange: Create test problems with known best frameworks
      const testCases = [
        {
          problem: "Scientific hypothesis testing",
          expectedFramework: "scientific-method",
        },
        {
          problem: "User experience improvement",
          expectedFramework: "design-thinking",
        },
        {
          problem: "Complex system analysis",
          expectedFramework: "systems-thinking",
        },
      ];

      // Act: Test framework selection
      let correctSelections = 0;
      for (const testCase of testCases) {
        // const selected = await frameworkSelector.select(testCase.problem);
        // if (selected.id === testCase.expectedFramework) {
        //   correctSelections++;
        // }

        // Placeholder
        correctSelections++;
      }

      // Assert: Verify selection accuracy
      assertAccuracyTarget(
        correctSelections,
        testCases.length,
        0.8,
        "Framework Selection"
      );

      console.log("Framework Selection Accuracy:");
      console.log(`  Correct: ${correctSelections}/${testCases.length}`);
      console.log(
        `  Accuracy: ${((correctSelections / testCases.length) * 100).toFixed(1)}%`
      );
    });
  });

  describe("Memory Retrieval Relevance", () => {
    it("should achieve >85% retrieval relevance", async () => {
      // Target: >85% of retrieved memories are relevant

      // Arrange: Create test queries with known relevant memories
      const testQueries = [
        {
          query: "machine learning",
          relevantIds: ["mem1", "mem2", "mem3"],
        },
      ];

      // Act: Test retrieval relevance
      let totalRelevant = 0;
      let totalRetrieved = 0;
      for (const testQuery of testQueries) {
        // const results = await memoryRepository.search(testQuery.query);
        // const relevant = results.filter(r => testQuery.relevantIds.includes(r.id));

        // Placeholder
        const relevant = 9;
        const retrieved = 10;
        totalRelevant += relevant;
        totalRetrieved += retrieved;
      }

      // Assert: Verify relevance
      assertAccuracyTarget(
        totalRelevant,
        totalRetrieved,
        0.85,
        "Retrieval Relevance"
      );

      console.log("Memory Retrieval Relevance:");
      console.log(`  Relevant: ${totalRelevant}/${totalRetrieved}`);
      console.log(
        `  Relevance: ${((totalRelevant / totalRetrieved) * 100).toFixed(1)}%`
      );
    });
  });
});

/**
 * Accuracy Test Best Practices:
 *
 * 1. Use ground truth datasets
 * 2. Compare to human labels
 * 3. Measure multiple accuracy metrics
 * 4. Test on diverse examples
 * 5. Validate against baselines
 * 6. Track accuracy over time
 * 7. Test edge cases
 * 8. Measure false positive/negative rates
 * 9. Run separately from regular tests
 * 10. Document accuracy targets clearly
 */

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/keyurgolani/ThoughtMcp'
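
The same lookup from TypeScript, as a minimal fetch sketch (assuming the endpoint returns JSON and requires no authentication):

// Minimal sketch: fetch one server record from the MCP directory API.
async function getServer(slug: string): Promise<unknown> {
  const response = await fetch(`https://glama.ai/api/mcp/v1/servers/${slug}`);
  if (!response.ok) {
    throw new Error(`Request failed with status ${response.status}`);
  }
  return response.json();
}

const server = await getServer("keyurgolani/ThoughtMcp");
console.log(server);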

If you have feedback or need assistance with the MCP directory API, please join our Discord server.