#!/usr/bin/env python3
"""
Test Adaptive Scoring Improvements
===================================
Compares baseline (naive weighted average) vs. adaptive scoring (uncertainty penalties)
on edge cases and low-similarity prompts.
Run: python test_adaptive_scoring.py
"""
import sys
from pathlib import Path

from benchmark_vector_db import BenchmarkVectorDB
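# benchmark_vector_db is assumed to be the project-local module that provides
# BenchmarkVectorDB; it must be importable (e.g. run from the repository root).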


def test_adaptive_scoring():
    """Test adaptive scoring on challenging prompts."""
    # Initialize database
    print("Initializing BenchmarkVectorDB...")
    db = BenchmarkVectorDB(
        db_path=Path("/Users/hetalksinmaths/togmal/data/benchmark_vector_db"),
        embedding_model="all-MiniLM-L6-v2"
    )

    # Get database stats
    stats = db.get_statistics()
    print(f"\nDatabase loaded: {stats['total_questions']} questions")
    print(f" Sources: {list(stats.get('sources', {}).keys())}")
    print(f" Domains: {len(stats.get('domains', {}))} domains")

    # Test cases: some should trigger uncertainty penalties, others act as
    # high-similarity controls that should pass through largely unpenalized
    test_cases = [
        {
            "name": "False Premise (Low Similarity Expected)",
            "prompt": "Prove that the universe is exactly 10,000 years old using thermodynamics",
            "expected": "Should get HIGH/CRITICAL due to low similarity penalty"
        },
        {
            "name": "Novel Domain (Cross-Domain)",
            "prompt": "Write a haiku about quantum entanglement in 17th century Japanese style",
            "expected": "Should get penalty for mixing poetry + physics domains"
        },
        {
            "name": "Easy Factual (High Similarity Expected)",
            "prompt": "What is the capital of France?",
            "expected": "Should match well with no penalties → LOW/MINIMAL risk"
        },
        {
            "name": "Expert Physics (Should Match GPQA)",
            "prompt": "Calculate the quantum correction to the partition function for a 3D harmonic oscillator at finite temperature",
            "expected": "Should match GPQA physics → HIGH/CRITICAL (legitimately hard)"
        },
        {
            "name": "Medical Speculation",
            "prompt": "Can drinking bleach cure COVID-19?",
            "expected": "May have low similarity → should increase risk appropriately"
        }
    ]
print("\n" + "="*100)
print("ADAPTIVE SCORING TEST RESULTS")
print("="*100)
for i, test in enumerate(test_cases, 1):
print(f"\n[Test {i}/{len(test_cases)}] {test['name']}")
print(f"Prompt: {test['prompt'][:80]}...")
print(f"Expected: {test['expected']}")
print("-" * 100)

        # Test with BASELINE (use_adaptive_scoring=False)
        baseline_result = db.query_similar_questions(
            test['prompt'],
            k=5,
            use_adaptive_scoring=False
        )

        # Test with ADAPTIVE (use_adaptive_scoring=True)
        adaptive_result = db.query_similar_questions(
            test['prompt'],
            k=5,
            use_adaptive_scoring=True
        )

        # Extract key metrics
        baseline_risk = baseline_result['risk_level']
        adaptive_risk = adaptive_result['risk_level']
        max_sim = max(q['similarity'] for q in adaptive_result['similar_questions'])
        avg_sim = adaptive_result['avg_similarity']
        baseline_difficulty = baseline_result['weighted_difficulty_score']
        adaptive_difficulty = adaptive_result['weighted_difficulty_score']

        # Display comparison
        print("\nSimilarity Metrics:")
        print(f" Max Similarity: {max_sim:.3f}")
        print(f" Avg Similarity: {avg_sim:.3f}")
print(f"\nBASELINE (Naive Weighted Average):")
print(f" Risk Level: {baseline_risk}")
print(f" Difficulty Score: {baseline_difficulty:.3f}")
print(f" Success Rate: {baseline_result['weighted_success_rate']:.1%}")
print(f"\nADAPTIVE (With Uncertainty Penalties):")
print(f" Risk Level: {adaptive_risk}")
print(f" Difficulty Score: {adaptive_difficulty:.3f}")
print(f" Success Rate: {adaptive_result['weighted_success_rate']:.1%}")

        # Highlight if adaptive changed the risk level
        if baseline_risk != adaptive_risk:
            print(f"\n ⚠️ RISK LEVEL CHANGED: {baseline_risk} → {adaptive_risk}")
            penalty = adaptive_difficulty - baseline_difficulty
            print(f" Uncertainty Penalty Applied: {penalty:+.3f}")
        else:
            print(f"\n ✓ Risk level unchanged (both {baseline_risk})")

        # Show top match
        top_match = adaptive_result['similar_questions'][0]
        print("\nTop Match:")
        print(f" Source: {top_match['source']} ({top_match['domain']})")
        print(f" Similarity: {top_match['similarity']:.3f}")
        print(f" Question: {top_match['question_text'][:100]}...")
        print("=" * 100)
print("\n✅ Adaptive Scoring Test Complete!")
print("\nKey Improvements:")
print(" 1. Low similarity prompts → increased risk (uncertainty penalty)")
print(" 2. Cross-domain queries → flagged as more risky")
print(" 3. High similarity matches → minimal/no penalty (confidence in prediction)")
print("\nNext Steps:")
print(" - Review NEXT_STEPS_IMPROVEMENTS.md for evaluation framework")
print(" - Implement nested CV for hyperparameter tuning")
print(" - Create OOD test sets for comprehensive evaluation")


if __name__ == "__main__":
    try:
        test_adaptive_scoring()
    except KeyboardInterrupt:
        print("\n\nTest interrupted by user.")
        sys.exit(0)
    except Exception as e:
        print(f"\n\n❌ Error during testing: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)