/**
* Tests for Calibration Learning Engine
*
* Tests prediction-outcome tracking, calibration error calculation,
* domain-specific model training, and accuracy improvement over time.
*/
import { beforeEach, describe, expect, it } from "vitest";
import { CalibrationLearningEngine } from "../../../confidence/calibration-learning-engine";
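// API surface exercised by these tests, inferred from the assertions below
// (a sketch only; names and exact signatures in the implementation may differ):
//   trackPredictionOutcome(predictedConfidence, actualOutcome, domain): void
//   getPredictionOutcomes(domain): pairs[]
//   getPredictionOutcomesByTimeRange(domain, start, end): pairs[]
//   calculateCalibrationError(domain): number
//   calculateCalibrationErrorByRange(domain): { low, medium, high }
//   trainCalibrationModel(domain): model (requires >= 1000 pairs)
//   getCalibrationModel(domain): model | undefined
//   identifyCalibrationBiases(domain): biases[]
//   adjustFactorWeights(biases): weights
//   getImprovementMetrics(domain): metrics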
describe("CalibrationLearningEngine", () => {
let engine: CalibrationLearningEngine;
beforeEach(() => {
engine = new CalibrationLearningEngine();
});
describe("Prediction-Outcome Pair Storage", () => {
it("should store prediction-outcome pair with all required fields", () => {
engine.trackPredictionOutcome(0.8, 1.0, "general");
const pairs = engine.getPredictionOutcomes("general");
expect(pairs).toHaveLength(1);
expect(pairs[0].predictedConfidence).toBe(0.8);
expect(pairs[0].actualOutcome).toBe(1.0);
expect(pairs[0].domain).toBe("general");
expect(pairs[0].timestamp).toBeInstanceOf(Date);
});
it("should retrieve pairs by domain", () => {
engine.trackPredictionOutcome(0.7, 1.0, "technical");
engine.trackPredictionOutcome(0.6, 0.0, "creative");
engine.trackPredictionOutcome(0.8, 1.0, "technical");
const technicalPairs = engine.getPredictionOutcomes("technical");
const creativePairs = engine.getPredictionOutcomes("creative");
expect(technicalPairs).toHaveLength(2);
expect(creativePairs).toHaveLength(1);
expect(technicalPairs[0].domain).toBe("technical");
expect(creativePairs[0].domain).toBe("creative");
});
it("should retrieve pairs by time range", () => {
const now = new Date();
const oneHourAgo = new Date(now.getTime() - 3600000);
engine.trackPredictionOutcome(0.7, 1.0, "general");
const recentPairs = engine.getPredictionOutcomesByTimeRange("general", oneHourAgo, now);
expect(recentPairs.length).toBeGreaterThan(0);
expect(recentPairs[0].timestamp.getTime()).toBeGreaterThanOrEqual(oneHourAgo.getTime());
});
it("should reject negative confidence values", () => {
expect(() => {
engine.trackPredictionOutcome(-0.1, 1.0, "general");
}).toThrow("Confidence must be between 0 and 1");
});
it("should reject confidence values greater than 1", () => {
expect(() => {
engine.trackPredictionOutcome(1.5, 1.0, "general");
}).toThrow("Confidence must be between 0 and 1");
});
it("should reject invalid outcome values", () => {
expect(() => {
engine.trackPredictionOutcome(0.8, -0.5, "general");
}).toThrow("Outcome must be between 0 and 1");
expect(() => {
engine.trackPredictionOutcome(0.8, 1.5, "general");
}).toThrow("Outcome must be between 0 and 1");
});
it("should handle empty domain gracefully", () => {
const pairs = engine.getPredictionOutcomes("nonexistent");
expect(pairs).toHaveLength(0);
});
});
describe("Calibration Error Calculation", () => {
it("should calculate mean absolute error correctly", () => {
// Perfect predictions
engine.trackPredictionOutcome(0.8, 0.8, "perfect");
engine.trackPredictionOutcome(0.6, 0.6, "perfect");
engine.trackPredictionOutcome(0.4, 0.4, "perfect");
const error = engine.calculateCalibrationError("perfect");
expect(error).toBe(0);
});
it("should calculate error for imperfect predictions", () => {
// Predictions off by 0.2 each
engine.trackPredictionOutcome(0.8, 0.6, "imperfect");
engine.trackPredictionOutcome(0.6, 0.8, "imperfect");
engine.trackPredictionOutcome(0.4, 0.6, "imperfect");
const error = engine.calculateCalibrationError("imperfect");
expect(error).toBeCloseTo(0.2, 2);
});
it("should calculate error by confidence range", () => {
// Low confidence range (0-0.3)
engine.trackPredictionOutcome(0.2, 0.5, "ranges");
// Medium confidence range (0.3-0.7)
engine.trackPredictionOutcome(0.5, 0.8, "ranges");
// High confidence range (0.7-1.0)
engine.trackPredictionOutcome(0.9, 0.7, "ranges");
const errorByRange = engine.calculateCalibrationErrorByRange("ranges");
expect(errorByRange).toHaveProperty("low");
expect(errorByRange).toHaveProperty("medium");
expect(errorByRange).toHaveProperty("high");
expect(errorByRange.low).toBeCloseTo(0.3, 2);
expect(errorByRange.medium).toBeCloseTo(0.3, 2);
expect(errorByRange.high).toBeCloseTo(0.2, 2);
});
it("should handle edge case with no data", () => {
const error = engine.calculateCalibrationError("empty");
expect(error).toBe(0);
});
it("should handle edge case with single data point", () => {
engine.trackPredictionOutcome(0.7, 0.9, "single");
const error = engine.calculateCalibrationError("single");
expect(error).toBeCloseTo(0.2, 2);
});
it("should calculate error for poor calibration", () => {
// Consistently overconfident
for (let i = 0; i < 10; i++) {
engine.trackPredictionOutcome(0.9, 0.3, "poor");
}
const error = engine.calculateCalibrationError("poor");
expect(error).toBeGreaterThan(0.5);
});
});
describe("Domain-Specific Model Training", () => {
it("should train model with sufficient data (1000+ pairs)", () => {
// Generate 1000 pairs with systematic bias (overconfident by 0.2)
for (let i = 0; i < 1000; i++) {
const actual = Math.random();
const predicted = Math.min(1.0, actual + 0.2);
engine.trackPredictionOutcome(predicted, actual, "sufficient");
}
const model = engine.trainCalibrationModel("sufficient");
expect(model).toBeDefined();
expect(model.domain).toBe("sufficient");
expect(model.sampleSize).toBe(1000);
expect(model.slope).toBeGreaterThan(0);
expect(model.intercept).toBeDefined();
expect(model.calibrationError).toBeGreaterThan(0);
expect(model.lastUpdated).toBeInstanceOf(Date);
});
it("should not train model with insufficient data", () => {
// Only 500 pairs
for (let i = 0; i < 500; i++) {
engine.trackPredictionOutcome(0.7, 0.7, "insufficient");
}
expect(() => {
engine.trainCalibrationModel("insufficient");
}).toThrow("Insufficient data for training");
});
it("should produce model that reduces calibration error", () => {
// Generate biased data
for (let i = 0; i < 1000; i++) {
const actual = Math.random();
const predicted = Math.min(1.0, actual + 0.15);
engine.trackPredictionOutcome(predicted, actual, "improvement");
}
const errorBefore = engine.calculateCalibrationError("improvement");
const model = engine.trainCalibrationModel("improvement");
// The trained model should report a lower calibration error than the raw data
expect(model.calibrationError).toBeLessThan(errorBefore);
});
it("should support multiple domain-specific models", () => {
// Train model for domain A
for (let i = 0; i < 1000; i++) {
engine.trackPredictionOutcome(0.8, 0.6, "domainA");
}
const modelA = engine.trainCalibrationModel("domainA");
// Train model for domain B
for (let i = 0; i < 1000; i++) {
engine.trackPredictionOutcome(0.4, 0.6, "domainB");
}
const modelB = engine.trainCalibrationModel("domainB");
expect(modelA.domain).toBe("domainA");
expect(modelB.domain).toBe("domainB");
expect(modelA.slope).not.toBe(modelB.slope);
});
it("should retrieve trained model", () => {
for (let i = 0; i < 1000; i++) {
engine.trackPredictionOutcome(0.7, 0.7, "retrieve");
}
engine.trainCalibrationModel("retrieve");
const model = engine.getCalibrationModel("retrieve");
expect(model).toBeDefined();
expect(model?.domain).toBe("retrieve");
});
it("should return undefined for untrained domain", () => {
const model = engine.getCalibrationModel("untrained");
expect(model).toBeUndefined();
});
});
describe("Calibration Accuracy (±10%)", () => {
it("should achieve predictions within ±10% of actual outcomes", () => {
// Generate well-calibrated data
for (let i = 0; i < 1000; i++) {
const actual = Math.random();
const predicted = actual + (Math.random() - 0.5) * 0.1; // ±0.05 noise
engine.trackPredictionOutcome(Math.max(0, Math.min(1, predicted)), actual, "accurate");
}
const model = engine.trainCalibrationModel("accurate");
expect(model.calibrationError).toBeLessThanOrEqual(0.1);
});
it("should improve accuracy after model training", () => {
// Generate biased data
for (let i = 0; i < 1000; i++) {
const actual = Math.random();
const predicted = Math.min(1.0, actual + 0.25);
engine.trackPredictionOutcome(predicted, actual, "improve");
}
const errorBefore = engine.calculateCalibrationError("improve");
engine.trainCalibrationModel("improve");
// The stored model should report a lower calibration error than the baseline
const model = engine.getCalibrationModel("improve");
expect(model).toBeDefined();
expect(model!.calibrationError).toBeLessThan(errorBefore);
});
it("should maintain accuracy across low confidence range", () => {
// Low confidence predictions (0-0.3)
for (let i = 0; i < 1000; i++) {
const actual = Math.random() * 0.3;
const predicted = actual + (Math.random() - 0.5) * 0.05;
engine.trackPredictionOutcome(Math.max(0, Math.min(0.3, predicted)), actual, "low-range");
}
engine.trainCalibrationModel("low-range");
const errorByRange = engine.calculateCalibrationErrorByRange("low-range");
expect(errorByRange.low).toBeLessThanOrEqual(0.1);
});
it("should maintain accuracy across medium confidence range", () => {
// Medium confidence predictions (0.3-0.7)
for (let i = 0; i < 1000; i++) {
const actual = 0.3 + Math.random() * 0.4;
const predicted = actual + (Math.random() - 0.5) * 0.05;
engine.trackPredictionOutcome(
Math.max(0.3, Math.min(0.7, predicted)),
actual,
"medium-range"
);
}
engine.trainCalibrationModel("medium-range");
const errorByRange = engine.calculateCalibrationErrorByRange("medium-range");
expect(errorByRange.medium).toBeLessThanOrEqual(0.1);
});
it("should maintain accuracy across high confidence range", () => {
// High confidence predictions (0.7-1.0)
for (let i = 0; i < 1000; i++) {
const actual = 0.7 + Math.random() * 0.3;
const predicted = actual + (Math.random() - 0.5) * 0.05;
engine.trackPredictionOutcome(
Math.max(0.7, Math.min(1.0, predicted)),
actual,
"high-range"
);
}
engine.trainCalibrationModel("high-range");
const errorByRange = engine.calculateCalibrationErrorByRange("high-range");
expect(errorByRange.high).toBeLessThanOrEqual(0.1);
});
it("should maintain accuracy for different domains", () => {
const domains = ["technical", "creative", "analytical"];
for (const domain of domains) {
for (let i = 0; i < 1000; i++) {
const actual = Math.random();
const predicted = actual + (Math.random() - 0.5) * 0.08;
engine.trackPredictionOutcome(Math.max(0, Math.min(1, predicted)), actual, domain);
}
const model = engine.trainCalibrationModel(domain);
expect(model.calibrationError).toBeLessThanOrEqual(0.1);
}
});
});
describe("Improvement Over Time", () => {
it("should show calibration error decreases with more data", () => {
const errors: number[] = [];
// Add data in batches and record the cumulative error after each batch
for (let batch = 0; batch < 5; batch++) {
for (let i = 0; i < 200; i++) {
const actual = Math.random();
const predicted = Math.min(1.0, actual + 0.2);
engine.trackPredictionOutcome(predicted, actual, "learning");
}
const error = engine.calculateCalibrationError("learning");
errors.push(error);
}
// Error should stabilize or decrease as the sample grows
expect(errors[errors.length - 1]).toBeLessThanOrEqual(errors[0] * 1.1);
});
it("should improve accuracy by at least 15% after 1000 pairs", () => {
// First 100 pairs - establish baseline
for (let i = 0; i < 100; i++) {
const actual = Math.random();
const predicted = Math.min(1.0, actual + 0.3);
engine.trackPredictionOutcome(predicted, actual, "improvement-15");
}
const baselineError = engine.calculateCalibrationError("improvement-15");
// Add 900 more pairs
for (let i = 0; i < 900; i++) {
const actual = Math.random();
const predicted = Math.min(1.0, actual + 0.3);
engine.trackPredictionOutcome(predicted, actual, "improvement-15");
}
// Train model
const model = engine.trainCalibrationModel("improvement-15");
// Improvement should be at least 15%
const improvement = (baselineError - model.calibrationError) / baselineError;
expect(improvement).toBeGreaterThanOrEqual(0.15);
});
it("should track improvement metrics over time", () => {
// Add data progressively
for (let i = 0; i < 1500; i++) {
const actual = Math.random();
const predicted = Math.min(1.0, actual + 0.2);
engine.trackPredictionOutcome(predicted, actual, "metrics");
}
engine.trainCalibrationModel("metrics");
const metrics = engine.getImprovementMetrics("metrics");
expect(metrics).toBeDefined();
expect(metrics.sampleCount).toBe(1500);
expect(metrics.currentError).toBeGreaterThan(0);
expect(metrics.improvementRate).toBeDefined();
});
it("should show learning curve with error reduction", () => {
const learningCurve: Array<{ samples: number; error: number }> = [];
// Collect error at different sample sizes
for (let samples = 200; samples <= 1200; samples += 200) {
const tempEngine = new CalibrationLearningEngine();
for (let i = 0; i < samples; i++) {
const actual = Math.random();
const predicted = Math.min(1.0, actual + 0.25);
tempEngine.trackPredictionOutcome(predicted, actual, "curve");
}
if (samples >= 1000) {
tempEngine.trainCalibrationModel("curve");
const model = tempEngine.getCalibrationModel("curve");
learningCurve.push({ samples, error: model!.calibrationError });
} else {
const error = tempEngine.calculateCalibrationError("curve");
learningCurve.push({ samples, error });
}
}
// Error should generally decrease or stabilize
expect(learningCurve[learningCurve.length - 1].error).toBeLessThanOrEqual(
learningCurve[0].error * 1.2
);
});
});
describe("Bias Identification", () => {
it("should identify overconfidence bias", () => {
// Consistently predict higher than actual
for (let i = 0; i < 1000; i++) {
const actual = Math.random() * 0.7;
const predicted = actual + 0.25;
engine.trackPredictionOutcome(predicted, actual, "overconfident");
}
const biases = engine.identifyCalibrationBiases("overconfident");
expect(biases).toBeDefined();
expect(biases.length).toBeGreaterThan(0);
expect(biases.some((b) => b.type === "overconfidence")).toBe(true);
});
it("should identify underconfidence bias", () => {
// Consistently predict lower than actual
for (let i = 0; i < 1000; i++) {
const actual = 0.3 + Math.random() * 0.7;
const predicted = actual - 0.25;
engine.trackPredictionOutcome(Math.max(0, predicted), actual, "underconfident");
}
const biases = engine.identifyCalibrationBiases("underconfident");
expect(biases).toBeDefined();
expect(biases.some((b) => b.type === "underconfidence")).toBe(true);
});
it("should identify confidence range with high error", () => {
// Create data with high error in medium confidence range
for (let i = 0; i < 1000; i++) {
const actual = Math.random();
let predicted: number;
if (actual >= 0.3 && actual <= 0.7) {
// Add large systematic error in medium range (overconfident)
predicted = Math.min(1.0, actual + 0.25);
} else {
predicted = Math.min(1.0, Math.max(0, actual + (Math.random() - 0.5) * 0.05));
}
engine.trackPredictionOutcome(predicted, actual, "range-error");
}
const errorByRange = engine.calculateCalibrationErrorByRange("range-error");
const biases = engine.identifyCalibrationBiases("range-error");
expect(biases).toBeDefined();
// Medium range should have significantly higher error
expect(errorByRange.medium).toBeGreaterThan(0.15);
// Should detect at least one bias
expect(biases.length).toBeGreaterThan(0);
});
it("should generate correction factors for biases", () => {
for (let i = 0; i < 1000; i++) {
const actual = Math.random();
const predicted = Math.min(1.0, actual + 0.2);
engine.trackPredictionOutcome(predicted, actual, "correction");
}
const biases = engine.identifyCalibrationBiases("correction");
expect(biases.length).toBeGreaterThan(0);
biases.forEach((bias) => {
expect(bias.correctionFactor).toBeDefined();
expect(typeof bias.correctionFactor).toBe("number");
});
});
});
describe("Factor Weight Adjustment", () => {
it("should return default weights when no biases provided", () => {
const weights = engine.adjustFactorWeights([]);
expect(weights).toBeDefined();
expect(weights.evidence).toBeCloseTo(0.3, 2);
expect(weights.coherence).toBeCloseTo(0.3, 2);
expect(weights.completeness).toBeCloseTo(0.25, 2);
expect(weights.uncertainty).toBeCloseTo(0.15, 2);
});
it("should adjust weights for overconfidence bias", () => {
const biases = [
{
type: "overconfidence" as const,
magnitude: 0.2,
confidenceRange: [0, 1] as [number, number],
correctionFactor: 0.9,
},
];
const weights = engine.adjustFactorWeights(biases);
// Overconfidence should increase uncertainty weight and decrease evidence weight
expect(weights.uncertainty).toBeGreaterThan(0.15);
expect(weights.evidence).toBeLessThan(0.3);
});
it("should adjust weights for underconfidence bias", () => {
const biases = [
{
type: "underconfidence" as const,
magnitude: 0.2,
confidenceRange: [0, 1] as [number, number],
correctionFactor: 1.1,
},
];
const weights = engine.adjustFactorWeights(biases);
// Underconfidence should decrease uncertainty weight and increase evidence weight
expect(weights.uncertainty).toBeLessThan(0.15);
expect(weights.evidence).toBeGreaterThan(0.3);
});
it("should normalize weights to sum to 1.0", () => {
const biases = [
{
type: "overconfidence" as const,
magnitude: 0.3,
confidenceRange: [0, 1] as [number, number],
correctionFactor: 0.85,
},
];
const weights = engine.adjustFactorWeights(biases);
const sum = weights.evidence + weights.coherence + weights.completeness + weights.uncertainty;
expect(sum).toBeCloseTo(1.0, 5);
});
it("should handle multiple biases", () => {
const biases = [
{
type: "overconfidence" as const,
magnitude: 0.15,
confidenceRange: [0, 1] as [number, number],
correctionFactor: 0.925,
},
{
type: "underconfidence" as const,
magnitude: 0.1,
confidenceRange: [0, 1] as [number, number],
correctionFactor: 1.05,
},
];
const weights = engine.adjustFactorWeights(biases);
// Should apply both adjustments
expect(weights).toBeDefined();
const sum = weights.evidence + weights.coherence + weights.completeness + weights.uncertainty;
expect(sum).toBeCloseTo(1.0, 5);
});
it("should respect weight bounds", () => {
const biases = [
{
type: "overconfidence" as const,
magnitude: 0.5,
confidenceRange: [0, 1] as [number, number],
correctionFactor: 0.75,
},
];
const weights = engine.adjustFactorWeights(biases);
// Weights should stay within reasonable bounds
expect(weights.uncertainty).toBeLessThanOrEqual(0.25);
expect(weights.evidence).toBeGreaterThanOrEqual(0.2);
});
it("should handle range-specific biases", () => {
const biases = [
{
type: "low-range-error" as const,
magnitude: 0.2,
confidenceRange: [0, 0.3] as [number, number],
correctionFactor: 0.94,
},
{
type: "medium-range-error" as const,
magnitude: 0.18,
confidenceRange: [0.3, 0.7] as [number, number],
correctionFactor: 0.946,
},
{
type: "high-range-error" as const,
magnitude: 0.16,
confidenceRange: [0.7, 1.0] as [number, number],
correctionFactor: 0.952,
},
];
const weights = engine.adjustFactorWeights(biases);
// Should handle range-specific biases without crashing
expect(weights).toBeDefined();
const sum = weights.evidence + weights.coherence + weights.completeness + weights.uncertainty;
expect(sum).toBeCloseTo(1.0, 5);
});
});
describe("Performance", () => {
it("should track prediction-outcome in < 1ms", () => {
const start = performance.now();
for (let i = 0; i < 100; i++) {
engine.trackPredictionOutcome(0.7, 0.8, "performance");
}
const duration = performance.now() - start;
const avgTime = duration / 100;
expect(avgTime).toBeLessThan(1);
});
it("should calculate error for 1000 pairs in < 10ms", () => {
for (let i = 0; i < 1000; i++) {
engine.trackPredictionOutcome(Math.random(), Math.random(), "perf-error");
}
const start = performance.now();
engine.calculateCalibrationError("perf-error");
const duration = performance.now() - start;
expect(duration).toBeLessThan(10);
});
it("should train model in < 100ms for 1000 pairs", () => {
for (let i = 0; i < 1000; i++) {
engine.trackPredictionOutcome(Math.random(), Math.random(), "perf-train");
}
const start = performance.now();
engine.trainCalibrationModel("perf-train");
const duration = performance.now() - start;
expect(duration).toBeLessThan(100);
});
});
});