Skip to main content
Glama
rescore_fallback.py9.41 kB
#!/usr/bin/env python3
"""
Re-score memories with fallback MS-MARCO + DeBERTa approach.

This script re-evaluates all DeBERTa-scored memories using the new
fallback approach: DeBERTa primary with MS-MARCO rescue for technical content.
"""

import asyncio
import sys
from pathlib import Path
from datetime import datetime
from collections import Counter

sys.path.insert(0, str(Path(__file__).parent.parent.parent))

from mcp_memory_service.storage.sqlite_vec import SqliteVecMemoryStorage
from mcp_memory_service.config import SQLITE_VEC_PATH
from mcp_memory_service.quality.onnx_ranker import get_onnx_ranker_model

# Default thresholds (can be overridden via the --deberta-threshold and
# --msmarco-threshold command-line flags)
DEFAULT_DEBERTA_THRESHOLD = 0.6
DEFAULT_MS_MARCO_THRESHOLD = 0.7

# Provider tags read from / written to memory metadata.
_SOURCE_PROVIDER = 'onnx_deberta'
_FALLBACK_PROVIDER = 'fallback_deberta-msmarco'

# Score assumed for a memory whose stored quality score is missing or null.
_DEFAULT_OLD_SCORE = 0.5


def _score_with_fallback(content, deberta, ms_marco, deberta_threshold, ms_marco_threshold):
    """Score *content* with DeBERTa, consulting MS-MARCO only when DeBERTa is low.

    Returns:
        Tuple ``(final_score, decision, deberta_score, ms_marco_score)`` where
        ``decision`` is one of ``'deberta_confident'``, ``'ms_marco_rescue'``
        or ``'both_low'`` and ``ms_marco_score`` is ``None`` when MS-MARCO was
        not consulted.
    """
    # Both models are called with an empty query (query-independent scoring).
    deberta_score = deberta.score_quality("", content)
    if deberta_score >= deberta_threshold:
        return deberta_score, 'deberta_confident', deberta_score, None

    ms_marco_score = ms_marco.score_quality("", content)
    if ms_marco_score >= ms_marco_threshold:
        return ms_marco_score, 'ms_marco_rescue', deberta_score, ms_marco_score

    # Both models agree the content is low quality: keep the DeBERTa score.
    return deberta_score, 'both_low', deberta_score, ms_marco_score


def _format_ms_marco(score):
    """Render an MS-MARCO score for display; ``None`` means it was never computed.

    Tests ``is not None`` rather than truthiness so a genuine score of 0.0 is
    shown as ``M:0.000`` instead of ``M:N/A``.
    """
    return f"M:{score:.3f}" if score is not None else "M:N/A"


def _percentage(count, total):
    """Return *count* as a percentage of *total* (0 when *total* is 0)."""
    return (count / total) * 100 if total else 0


async def rescore_fallback(
    dry_run=True,
    deberta_threshold=DEFAULT_DEBERTA_THRESHOLD,
    ms_marco_threshold=DEFAULT_MS_MARCO_THRESHOLD
):
    """
    Re-score all DeBERTa memories with fallback approach.

    Every memory whose ``quality_provider`` metadata is ``onnx_deberta`` is
    re-scored. In live mode the new score, provider tag and score components
    are written back to storage and a summary log is saved under
    ``~/backups/mcp-memory-service``. In dry-run mode nothing is written.

    Args:
        dry_run: If True, only report what would change (default)
        deberta_threshold: DeBERTa confidence threshold (default: 0.6)
        ms_marco_threshold: MS-MARCO rescue threshold (default: 0.7)

    Returns:
        None. Results are reported on stdout (and in the log file in live mode).
    """
    banner = "=" * 80

    print(banner)
    print(f"Fallback Re-scoring - {'DRY RUN' if dry_run else 'LIVE MODE'}")
    print(banner)
    print(f"DeBERTa threshold: {deberta_threshold}")
    print(f"MS-MARCO threshold: {ms_marco_threshold}")
    print()

    # Load both models
    print("Loading models...")
    deberta = get_onnx_ranker_model('nvidia-quality-classifier-deberta', 'auto')
    ms_marco = get_onnx_ranker_model('ms-marco-MiniLM-L-6-v2', 'auto')

    if not deberta or not ms_marco:
        print("❌ Failed to load both models")
        print()
        if not deberta:
            print(" DeBERTa model not available")
        if not ms_marco:
            print(" MS-MARCO model not available")
        return

    print("✓ DeBERTa loaded")
    print("✓ MS-MARCO loaded")
    print()

    # Connect to storage
    # NOTE(review): the storage handle is never explicitly closed here;
    # confirm whether SqliteVecMemoryStorage requires it.
    print("Connecting to database...")
    storage = SqliteVecMemoryStorage(SQLITE_VEC_PATH)
    await storage.initialize()
    print(f"✓ Connected to: {SQLITE_VEC_PATH}")
    print()

    # Get DeBERTa-scored memories
    all_memories = await storage.get_all_memories()
    to_rescore = [
        m for m in all_memories
        if m.metadata and m.metadata.get('quality_provider') == _SOURCE_PROVIDER
    ]
    total = len(to_rescore)

    print(f"Total memories: {len(all_memories)}")
    print(f"Memories to re-score: {total} (onnx_deberta only)")
    print()

    if not to_rescore:
        print("✓ No memories to re-score!")
        return

    improvements = []
    # Plain dict with fixed keys so zero-count decisions are still reported,
    # in this order.
    decision_counts = {
        'deberta_confident': 0,
        'ms_marco_rescue': 0,
        'both_low': 0
    }

    print("Re-scoring memories...")
    print()

    for i, memory in enumerate(to_rescore, 1):
        # The key may be absent or stored as null; both fall back to the
        # neutral default so the delta below never hits None arithmetic.
        old_score = memory.metadata.get('quality_score')
        if old_score is None:
            old_score = _DEFAULT_OLD_SCORE

        final_score, decision, deberta_score, ms_marco_score = _score_with_fallback(
            memory.content, deberta, ms_marco, deberta_threshold, ms_marco_threshold
        )

        decision_counts[decision] += 1
        delta = final_score - old_score

        # Track significant changes (>0.1 difference)
        if abs(delta) > 0.1:
            improvements.append({
                'content': memory.content[:80].replace('\n', ' '),
                'old': old_score,
                'deberta': deberta_score,
                'ms_marco': ms_marco_score,
                'final': final_score,
                'decision': decision,
                'delta': delta,
                'hash': memory.content_hash
            })

        if not dry_run:
            # Build quality_components dict
            components = {
                'final_score': final_score,
                'deberta_score': deberta_score,
                'decision': decision
            }
            if ms_marco_score is not None:
                components['ms_marco_score'] = ms_marco_score

            # Update memory metadata
            await storage.update_memory_metadata(
                content_hash=memory.content_hash,
                updates={
                    'quality_score': final_score,
                    'quality_provider': _FALLBACK_PROVIDER,
                    'quality_components': components
                }
            )

        # Progress indicator
        if i % 100 == 0:
            ms_str = _format_ms_marco(ms_marco_score)
            print(
                f" [{i:5d}/{total}] {decision[:4].upper()} | "
                f"Final: {final_score:.3f} (D:{deberta_score:.3f}, {ms_str})"
            )

    # Report decision distribution
    print()
    print(banner)
    print("Decision Distribution")
    print(banner)
    for decision, count in decision_counts.items():
        pct = _percentage(count, total)
        print(f"{decision:20s}: {count:5d} ({pct:5.1f}%)")
    print()

    # Report top improvements (largest positive delta first)
    print(banner)
    print("Top Improvements (fallback vs DeBERTa-only)")
    print(banner)
    improvements.sort(key=lambda x: -x['delta'])
    for imp in improvements[:15]:
        ms_str = _format_ms_marco(imp['ms_marco'])
        print(
            f"Delta: {imp['delta']:+.3f} | Old: {imp['old']:.3f} → "
            f"New: {imp['final']:.3f} ({imp['decision']})"
        )
        print(f" DeBERTa: {imp['deberta']:.3f}, {ms_str}")
        print(f" {imp['content']}")
        print()

    if dry_run:
        print(banner)
        print("DRY RUN COMPLETE - No changes made")
        print(banner)
        print()
        print("To execute re-scoring, run:")
        print(f" python {__file__} --execute")
        print()
        print("To adjust thresholds:")
        print(f" python {__file__} --execute --deberta-threshold 0.5 --msmarco-threshold 0.6")
        return

    # Execute mode - create log
    print(banner)
    print("RE-SCORING COMPLETE")
    print(banner)
    print()

    log_path = Path.home() / 'backups/mcp-memory-service' / \
        f'rescore-fallback-{datetime.now().strftime("%Y%m%d-%H%M%S")}.txt'
    log_path.parent.mkdir(parents=True, exist_ok=True)

    # Explicit encoding keeps the log independent of the platform default.
    with open(log_path, 'w', encoding='utf-8') as log:
        log.write(f"Fallback re-scoring executed: {datetime.now()}\n")
        log.write(f"DeBERTa threshold: {deberta_threshold}\n")
        log.write(f"MS-MARCO threshold: {ms_marco_threshold}\n")
        log.write(f"Total re-scored: {total}\n\n")
        log.write("Decision Distribution:\n")
        for decision, count in decision_counts.items():
            pct = _percentage(count, total)
            log.write(f" {decision:20s}: {count:5d} ({pct:5.1f}%)\n")
        log.write("\n")
        log.write("Top Improvements:\n")
        for imp in improvements[:50]:  # Log top 50
            log.write(
                f"{imp['hash']}\t{imp['old']:.4f}\t{imp['final']:.4f}\t"
                f"{imp['delta']:+.4f}\t{imp['decision']}\n"
            )

    print(f"✓ Updated: {total} memories")
    print(f" DeBERTa confident: {decision_counts['deberta_confident']}")
    print(f" MS-MARCO rescue: {decision_counts['ms_marco_rescue']}")
    print(f" Both low: {decision_counts['both_low']}")
    print()
    print(f"📝 Log saved: {log_path}")
    print()

    # Verify final state
    remaining = await storage.get_all_memories()
    fallback_count = sum(
        1 for m in remaining
        if m.metadata and m.metadata.get('quality_provider') == _FALLBACK_PROVIDER
    )

    print("Database state:")
    print(f" Total memories: {len(remaining)}")
    print(f" Fallback-scored: {fallback_count}")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Re-score memories with DeBERTa + MS-MARCO fallback approach"
    )
    parser.add_argument(
        '--execute',
        action='store_true',
        help="Execute re-scoring (default: dry-run only)"
    )
    parser.add_argument(
        '--deberta-threshold',
        type=float,
        default=DEFAULT_DEBERTA_THRESHOLD,
        help=f"DeBERTa confidence threshold (default: {DEFAULT_DEBERTA_THRESHOLD})"
    )
    parser.add_argument(
        '--msmarco-threshold',
        type=float,
        default=DEFAULT_MS_MARCO_THRESHOLD,
        help=f"MS-MARCO rescue threshold (default: {DEFAULT_MS_MARCO_THRESHOLD})"
    )

    args = parser.parse_args()

    asyncio.run(rescore_fallback(
        dry_run=not args.execute,
        deberta_threshold=args.deberta_threshold,
        ms_marco_threshold=args.msmarco_threshold
    ))

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/doobidoo/mcp-memory-service'

If you have feedback or need assistance with the MCP directory API, please join our Discord server