#!/usr/bin/env python3
"""
Test Adaptive Scoring Improvements
===================================
Compares baseline (naive weighted average) vs. adaptive scoring (uncertainty penalties)
on edge cases and low-similarity prompts.
Run: python test_adaptive_scoring.py
"""
import sys
from pathlib import Path

from benchmark_vector_db import BenchmarkVectorDB
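# benchmark_vector_db is assumed to be the project-local module that provides
# BenchmarkVectorDB; it must be importable (e.g. run from the repository root).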


def test_adaptive_scoring():
    """Test adaptive scoring on challenging prompts."""
    # Initialize database
    print("Initializing BenchmarkVectorDB...")
    db = BenchmarkVectorDB(
        db_path=Path("/Users/hetalksinmaths/togmal/data/benchmark_vector_db"),
        embedding_model="all-MiniLM-L6-v2"
    )

    # Get database stats
    stats = db.get_statistics()
    print(f"\nDatabase loaded: {stats['total_questions']} questions")
    print(f" Sources: {list(stats.get('sources', {}).keys())}")
    print(f" Domains: {len(stats.get('domains', {}))} domains")

    # Test cases: some should trigger uncertainty penalties, others act as
    # high-similarity controls that should pass through largely unpenalized
    test_cases = [
        {
            "name": "False Premise (Low Similarity Expected)",
            "prompt": "Prove that the universe is exactly 10,000 years old using thermodynamics",
            "expected": "Should get HIGH/CRITICAL due to low similarity penalty"
        },
        {
            "name": "Novel Domain (Cross-Domain)",
            "prompt": "Write a haiku about quantum entanglement in 17th century Japanese style",
            "expected": "Should get penalty for mixing poetry + physics domains"
        },
        {
            "name": "Easy Factual (High Similarity Expected)",
            "prompt": "What is the capital of France?",
            "expected": "Should match well with no penalties → LOW/MINIMAL risk"
        },
        {
            "name": "Expert Physics (Should Match GPQA)",
            "prompt": "Calculate the quantum correction to the partition function for a 3D harmonic oscillator at finite temperature",
            "expected": "Should match GPQA physics → HIGH/CRITICAL (legitimately hard)"
        },
        {
            "name": "Medical Speculation",
            "prompt": "Can drinking bleach cure COVID-19?",
            "expected": "May have low similarity → should increase risk appropriately"
        }
    ]
print("\n" + "="*100)
print("ADAPTIVE SCORING TEST RESULTS")
print("="*100)
for i, test in enumerate(test_cases, 1):
print(f"\n[Test {i}/{len(test_cases)}] {test['name']}")
print(f"Prompt: {test['prompt'][:80]}...")
print(f"Expected: {test['expected']}")
print("-" * 100)

        # Test with BASELINE (use_adaptive_scoring=False)
        baseline_result = db.query_similar_questions(
            test['prompt'],
            k=5,
            use_adaptive_scoring=False
        )

        # Test with ADAPTIVE (use_adaptive_scoring=True)
        adaptive_result = db.query_similar_questions(
            test['prompt'],
            k=5,
            use_adaptive_scoring=True
        )

        # Extract key metrics
        baseline_risk = baseline_result['risk_level']
        adaptive_risk = adaptive_result['risk_level']
        max_sim = max(q['similarity'] for q in adaptive_result['similar_questions'])
        avg_sim = adaptive_result['avg_similarity']
        baseline_difficulty = baseline_result['weighted_difficulty_score']
        adaptive_difficulty = adaptive_result['weighted_difficulty_score']

        # Display comparison
        print("\nSimilarity Metrics:")
        print(f" Max Similarity: {max_sim:.3f}")
        print(f" Avg Similarity: {avg_sim:.3f}")
print(f"\nBASELINE (Naive Weighted Average):")
print(f" Risk Level: {baseline_risk}")
print(f" Difficulty Score: {baseline_difficulty:.3f}")
print(f" Success Rate: {baseline_result['weighted_success_rate']:.1%}")
print(f"\nADAPTIVE (With Uncertainty Penalties):")
print(f" Risk Level: {adaptive_risk}")
print(f" Difficulty Score: {adaptive_difficulty:.3f}")
print(f" Success Rate: {adaptive_result['weighted_success_rate']:.1%}")

        # Highlight if adaptive changed the risk level
        if baseline_risk != adaptive_risk:
            print(f"\n ⚠️ RISK LEVEL CHANGED: {baseline_risk} → {adaptive_risk}")
            penalty = adaptive_difficulty - baseline_difficulty
            print(f" Uncertainty Penalty Applied: {penalty:+.3f}")
        else:
            print(f"\n ✓ Risk level unchanged (both {baseline_risk})")

        # Show top match
        top_match = adaptive_result['similar_questions'][0]
        print("\nTop Match:")
        print(f" Source: {top_match['source']} ({top_match['domain']})")
        print(f" Similarity: {top_match['similarity']:.3f}")
        print(f" Question: {top_match['question_text'][:100]}...")
        print("=" * 100)
print("\n✅ Adaptive Scoring Test Complete!")
print("\nKey Improvements:")
print(" 1. Low similarity prompts → increased risk (uncertainty penalty)")
print(" 2. Cross-domain queries → flagged as more risky")
print(" 3. High similarity matches → minimal/no penalty (confidence in prediction)")
print("\nNext Steps:")
print(" - Review NEXT_STEPS_IMPROVEMENTS.md for evaluation framework")
print(" - Implement nested CV for hyperparameter tuning")
print(" - Create OOD test sets for comprehensive evaluation")


if __name__ == "__main__":
    try:
        test_adaptive_scoring()
    except KeyboardInterrupt:
        print("\n\nTest interrupted by user.")
        sys.exit(0)
    except Exception as e:
        print(f"\n\n❌ Error during testing: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)