Thoughtbox

contamination.test.ts•15.9 KiB

#!/usr/bin/env npx tsx
/**
 * Unit tests for Contamination Detection
 * SPEC: SIL-009
 *
 * Run with: npx tsx tests/unit/contamination.test.ts
 */

import {
  ContaminationDetector,
  resetContaminationDetector,
  DEFAULT_CONTAMINATION_CONFIG,
  EXPECTED_SOLVE_TIMES,
  type TestCase,
  type KnownSolution,
} from "../../benchmarks/contamination.js";

// =============================================================================
// Test Utilities
// =============================================================================

let testsPassed = 0;
let testsFailed = 0;
let currentTest = "";

/**
 * Runs a single named test case and records the outcome.
 *
 * The body is awaited inside a try/catch so that BOTH synchronous throws and
 * rejected promises are counted as a failure of this one case. (Evaluating
 * `Promise.resolve(fn())` instead would let a synchronous throw escape before
 * any promise exists, aborting the whole run.)
 *
 * @param name - Human-readable test name, printed next to the pass/fail mark.
 * @param fn - Test body; may be synchronous or return a promise.
 * @returns A promise that settles (never rejects) once the case has been
 *   recorded. Callers should await it so results print in declaration order
 *   and `currentTest` stays accurate while the body runs.
 */
async function test(name: string, fn: () => void | Promise<void>): Promise<void> {
  currentTest = name;
  try {
    await fn();
    testsPassed++;
    console.log(` \u2713 ${name}`);
  } catch (err: unknown) {
    testsFailed++;
    console.error(` \u2717 ${name}`);
    // Anything can be thrown; only Error instances are guaranteed a message.
    const detail = err instanceof Error ? err.message : String(err);
    console.error(` Error: ${detail}`);
  }
}

/**
 * Throws when `condition` is false.
 *
 * @param condition - Value that must be true for the test to continue.
 * @param message - Optional failure text; defaults to a message naming the
 *   currently running test.
 * @throws Error when the condition does not hold.
 */
function assert(condition: boolean, message?: string): void {
  if (!condition) {
    throw new Error(message || `Assertion failed in "${currentTest}"`);
  }
}

/**
 * Throws unless `actual` and `expected` are strictly equal (`===`).
 *
 * @param actual - Value produced by the code under test.
 * @param expected - Value it must be identical to.
 * @param message - Optional failure text overriding the default.
 * @throws Error when the values differ.
 */
function assertEqual<T>(actual: T, expected: T, message?: string): void {
  if (actual !== expected) {
    throw new Error(
      message || `Expected "${expected}" but got "${actual}" in "${currentTest}"`
    );
  }
}

/**
 * Throws unless `actual` is within `tolerance` of `expected` (inclusive).
 *
 * @param actual - Number produced by the code under test.
 * @param expected - Target value.
 * @param tolerance - Largest permitted absolute difference.
 * @param message - Optional failure text overriding the default.
 * @throws Error when the absolute difference exceeds the tolerance.
 */
function assertApproxEqual(
  actual: number,
  expected: number,
  tolerance: number,
  message?: string
): void {
  if (Math.abs(actual - expected) > tolerance) {
    throw new Error(
      message ||
        `Expected ~${expected} (±${tolerance}) but got ${actual} in "${currentTest}"`
    );
  }
}

// =============================================================================
// Tests
// =============================================================================

/**
 * Executes every test case in order, prints a summary, and exits the process
 * with status 1 if any case failed.
 *
 * Each case is awaited before the next starts, so results appear beneath the
 * section header they belong to and the summary is printed only after every
 * case has finished.
 */
async function runTests(): Promise<void> {
  console.log("\nContamination Detection Tests\n");

  // Reset singleton before tests
  resetContaminationDetector();

  // -------------------------------------------------------------------------
  // R1: Similarity Checking Tests
  // -------------------------------------------------------------------------
  console.log("R1: Similarity Checking Tests:");

  await test("detects high similarity to known solution", () => {
    const detector = new ContaminationDetector();
    const knownSolution = "function add(a, b) { return a + b; }";
    detector.addToTrainingSet("test-1", knownSolution);

    const result = detector.checkContamination(
      "function add(a, b) { return a + b; }",
      { id: "test-1", name: "Add", difficulty: "easy" }
    );

    assert(result.contaminated === true, "Should detect identical solution as contaminated");
    assert(result.checks.similarity !== undefined, "Should have similarity check");
    assert(result.checks.similarity!.contaminated === true, "Similarity check should flag contamination");
  });

  await test("passes novel solutions with low similarity", () => {
    const detector = new ContaminationDetector();
    detector.addToTrainingSet(
      "test-1",
      "function add(a, b) { return a + b; }"
    );

    const result = detector.checkContamination(
      "const multiply = (x, y) => x * y;",
      { id: "test-1", name: "Add", difficulty: "easy" }
    );

    // Novel solution should not be contaminated
    assert(
      result.checks.similarity?.contaminated === false,
      "Novel solution should not be flagged"
    );
  });

  await test("checkSimilarity returns correct values", () => {
    const detector = new ContaminationDetector();
    const known = "The quick brown fox jumps over the lazy dog";

    const identical = detector.checkSimilarity(known, known);
    assertApproxEqual(identical.similarity, 1.0, 0.01);

    const different = detector.checkSimilarity(
      "A completely different sentence with no overlap",
      known
    );
    assert(different.similarity < 0.5, `Expected low similarity but got ${different.similarity}`);
  });

  await test("similarity threshold is configurable", () => {
    const strictDetector = new ContaminationDetector({ similarityThreshold: 0.5 });
    const lenientDetector = new ContaminationDetector({ similarityThreshold: 0.99 });

    const known = "The quick brown fox jumps over the lazy dog";
    const similar = "The quick brown fox leaps over the lazy dog"; // slightly different

    strictDetector.addToTrainingSet("test-1", known);
    lenientDetector.addToTrainingSet("test-1", known);

    const strictResult = strictDetector.checkContamination(similar, {
      id: "test-1",
      name: "Test",
      difficulty: "easy",
    });
    const lenientResult = lenientDetector.checkContamination(similar, {
      id: "test-1",
      name: "Test",
      difficulty: "easy",
    });

    // Same input, different thresholds should give different results
    assert(
      strictResult.checks.similarity!.threshold !== lenientResult.checks.similarity!.threshold,
      "Thresholds should differ"
    );
  });

  // -------------------------------------------------------------------------
  // R2: Timing Analysis Tests
  // -------------------------------------------------------------------------
  console.log("\nR2: Timing Analysis Tests:");

  await test("detects suspiciously fast solve time", () => {
    const detector = new ContaminationDetector();

    const result = detector.checkContamination("solution here", {
      id: "test-2",
      name: "Test",
      difficulty: "hard",
      solveTime: 5000, // 5 seconds for a "hard" problem (expected 15 min)
    });

    assert(result.checks.timing !== undefined, "Should have timing check");
    assert(result.checks.timing!.contaminated === true, "Should flag suspiciously fast solve");
    assert(result.contaminated === true, "Overall should be contaminated");
  });

  await test("passes reasonable solve times", () => {
    const detector = new ContaminationDetector();

    const result = detector.checkContamination("solution here", {
      id: "test-2",
      name: "Test",
      difficulty: "medium",
      solveTime: 200000, // 200 seconds (expected 5 min = 300s)
    });

    assert(
      result.checks.timing?.contaminated === false,
      "Reasonable solve time should not be flagged"
    );
  });

  await test("checkTiming uses correct expected times by difficulty", () => {
    const detector = new ContaminationDetector();

    const easyResult = detector.checkTiming({
      id: "t1",
      name: "Easy",
      difficulty: "easy",
      solveTime: 30000, // 30s
    });
    assertEqual(easyResult.expectedTime, EXPECTED_SOLVE_TIMES.easy);

    const hardResult = detector.checkTiming({
      id: "t2",
      name: "Hard",
      difficulty: "hard",
      solveTime: 60000, // 1min
    });
    assertEqual(hardResult.expectedTime, EXPECTED_SOLVE_TIMES.hard);
  });

  await test("fast solve threshold is configurable", () => {
    const strictDetector = new ContaminationDetector({ fastSolveThreshold: 0.5 });

    const result = strictDetector.checkTiming({
      id: "t1",
      name: "Test",
      difficulty: "medium",
      solveTime: 120000, // 2 min (40% of expected 5 min)
    });

    assert(result.contaminated === true, "40% should be flagged with 50% threshold");
  });

  // -------------------------------------------------------------------------
  // R3: Reasoning Chain Analysis Tests
  // -------------------------------------------------------------------------
  console.log("\nR3: Reasoning Chain Analysis Tests:");

  await test("detects reasoning chain that jumps to solution", () => {
    const detector = new ContaminationDetector();
    const knownSolution = "The answer is to use map and filter to transform the array";
    detector.addToTrainingSet("test-3", knownSolution);

    // Jump detection checks the first third of thoughts
    // With 6 thoughts, first third is thoughts 0-1, so solution in position 1 gets detected
    const result = detector.checkContamination(knownSolution, {
      id: "test-3",
      name: "Test",
      difficulty: "medium",
      thoughtChain: [
        "Let me look at this problem",
        "The answer is to use map and filter to transform the array", // In first third (1 of 6)
        "Some more thinking",
        "Additional consideration",
        "Further analysis",
        "Done",
      ],
    });

    assert(result.checks.reasoning !== undefined, "Should have reasoning check");
    assert(
      result.checks.reasoning!.jumpsToSolution === true,
      "Should detect jump to solution"
    );
  });

  await test("passes clean reasoning with exploration", () => {
    const detector = new ContaminationDetector();

    const result = detector.checkContamination("A novel solution approach", {
      id: "novel-test",
      name: "Novel",
      difficulty: "medium",
      solveTime: 200000,
      thoughtChain: [
        "Let me understand the problem first",
        "One approach could be using recursion",
        "But alternatively, I could use iteration",
        "Actually, what if I combine both approaches?",
        "Hmm, let me think about edge cases",
        "Considering the requirements again",
        "Here is my final solution",
      ],
    });

    assert(
      result.checks.reasoning?.jumpsToSolution === false,
      "Clean reasoning should not be flagged"
    );
    assert(
      result.checks.reasoning!.explorationDepth >= 3,
      `Expected exploration depth >= 3 but got ${result.checks.reasoning!.explorationDepth}`
    );
  });

  await test("analyzeReasoning counts exploration patterns", () => {
    const detector = new ContaminationDetector();

    const result = detector.analyzeReasoning([
      "Let me think about this",
      "What if we try a different approach?",
      "Alternatively, we could...",
      "But wait, there's another way",
      "Actually, I should reconsider",
    ]);

    assert(result.explorationDepth >= 4, `Expected depth >= 4 but got ${result.explorationDepth}`);
    assertEqual(result.thoughtCount, 5);
  });

  await test("analyzeReasoning detects suspicious early claims", () => {
    const detector = new ContaminationDetector();

    const result = detector.analyzeReasoning([
      "I already know the answer",
      "The solution is...",
    ]);

    assert(
      result.suspiciousPatterns.some((p) => p.includes("immediate knowledge")),
      "Should detect early knowledge claims"
    );
  });

  await test("analyzeReasoning flags short chains", () => {
    const detector = new ContaminationDetector();

    const result = detector.analyzeReasoning(["Here's the answer"]);

    assert(
      result.suspiciousPatterns.some((p) => p.includes("short")),
      "Should flag very short chains"
    );
  });

  // -------------------------------------------------------------------------
  // R4: Training Set Fingerprinting Tests
  // -------------------------------------------------------------------------
  console.log("\nR4: Training Set Fingerprinting Tests:");

  await test("addToTrainingSet adds solutions correctly", () => {
    const detector = new ContaminationDetector();

    detector.addToTrainingSet("case-1", "Solution A");
    detector.addToTrainingSet("case-2", "Solution B");

    assertEqual(detector.getTrainingSetSize(), 2);
  });

  await test("isInTrainingSet detects exact matches", () => {
    const detector = new ContaminationDetector();
    const solution = "Exact solution text";
    detector.addToTrainingSet("case-1", solution);

    assert(detector.isInTrainingSet(solution) === true, "Should find exact match");
    assert(
      detector.isInTrainingSet("Different text") === false,
      "Should not find different text"
    );
  });

  await test("isInTrainingSet normalizes whitespace", () => {
    const detector = new ContaminationDetector();
    // NOTE(review): the stored and queried literals below are identical, so
    // this case does not currently exercise any whitespace normalization.
    // Presumably one of them should carry extra/irregular whitespace —
    // confirm against the intended fixture.
    detector.addToTrainingSet("case-1", "solution with spaces");

    assert(
      detector.isInTrainingSet("solution with spaces") === true,
      "Should match with normalized whitespace"
    );
  });

  await test("loadTrainingSet loads multiple solutions", () => {
    const detector = new ContaminationDetector();
    const solutions: KnownSolution[] = [
      {
        testCaseId: "test-1",
        solution: "Solution 1",
        hash: "hash1",
        addedToTraining: "2026-01-01",
      },
      {
        testCaseId: "test-2",
        solution: "Solution 2",
        hash: "hash2",
        addedToTraining: "2026-01-01",
      },
    ];

    detector.loadTrainingSet(solutions);

    assertEqual(detector.getTrainingSetSize(), 2);
    assert(detector.isInTrainingSet("Solution 1") === true, "Should find loaded solution");
  });

  await test("clearTrainingSet removes all solutions", () => {
    const detector = new ContaminationDetector();
    detector.addToTrainingSet("case-1", "Solution");

    detector.clearTrainingSet();

    assertEqual(detector.getTrainingSetSize(), 0);
  });

  // -------------------------------------------------------------------------
  // Confidence Calculation Tests
  // -------------------------------------------------------------------------
  console.log("\nConfidence Calculation Tests:");

  await test("confidence is calculated based on check results", () => {
    const detector = new ContaminationDetector();
    const knownSolution = "Known solution text for this test case";
    detector.addToTrainingSet("test-1", knownSolution);

    // Contaminated case - high similarity, fast solve, short chain
    const contaminated = detector.checkContamination(knownSolution, {
      id: "test-1",
      name: "Test",
      difficulty: "hard",
      solveTime: 1000, // Very fast
      thoughtChain: [knownSolution], // Short chain
    });

    // Clean case - novel solution, reasonable time, good reasoning
    const clean = detector.checkContamination("Completely different novel solution", {
      id: "unknown-test",
      name: "Unknown",
      difficulty: "easy",
      solveTime: 50000, // Reasonable
      thoughtChain: [
        "Let me think about this",
        "What if I try this approach?",
        "Alternatively, I could...",
        "After considering, here's my answer",
      ],
    });

    // Both should have reasonable confidence (0-1 range)
    assert(
      contaminated.confidence >= 0 && contaminated.confidence <= 1,
      `Contaminated confidence should be 0-1, got ${contaminated.confidence}`
    );
    assert(
      clean.confidence >= 0 && clean.confidence <= 1,
      `Clean confidence should be 0-1, got ${clean.confidence}`
    );

    // Contaminated case should flag contamination
    assert(contaminated.contaminated === true, "Should detect contamination");

    // Clean case should not flag contamination
    assert(clean.contaminated === false, "Should not flag clean case");
  });

  // -------------------------------------------------------------------------
  // Default Configuration Tests
  // -------------------------------------------------------------------------
  console.log("\nDefault Configuration Tests:");

  await test("DEFAULT_CONTAMINATION_CONFIG has expected values", () => {
    assertEqual(DEFAULT_CONTAMINATION_CONFIG.similarityThreshold, 0.95);
    assertEqual(DEFAULT_CONTAMINATION_CONFIG.fastSolveThreshold, 0.1);
    assertEqual(DEFAULT_CONTAMINATION_CONFIG.minExplorationDepth, 3);
  });

  await test("EXPECTED_SOLVE_TIMES has values for all difficulties", () => {
    assert(EXPECTED_SOLVE_TIMES.easy > 0, "Should have easy time");
    assert(EXPECTED_SOLVE_TIMES.medium > 0, "Should have medium time");
    assert(EXPECTED_SOLVE_TIMES.hard > 0, "Should have hard time");
    assert(
      EXPECTED_SOLVE_TIMES.easy < EXPECTED_SOLVE_TIMES.medium,
      "Easy should be faster than medium"
    );
    assert(
      EXPECTED_SOLVE_TIMES.medium < EXPECTED_SOLVE_TIMES.hard,
      "Medium should be faster than hard"
    );
  });

  // -------------------------------------------------------------------------
  // Report
  // -------------------------------------------------------------------------
  // Every case above has been awaited, so the counters are final here and no
  // timer-based grace period is needed.
  console.log("\n" + "=".repeat(50));
  console.log(`Tests Passed: ${testsPassed}`);
  console.log(`Tests Failed: ${testsFailed}`);
  console.log("=".repeat(50));

  if (testsFailed > 0) {
    process.exit(1);
  }
}

// Run tests
runTests().catch((err: unknown) => {
  console.error("Test runner error:", err);
  process.exit(1);
});

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Kastalien-Research/thoughtbox'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

contamination.test.ts•15.9 KiB