test_deberta_quality.py (13.6 kB)
"""Tests for NVIDIA DeBERTa quality classifier integration.""" import pytest from pathlib import Path from mcp_memory_service.quality.onnx_ranker import ONNXRankerModel, get_onnx_ranker_model from mcp_memory_service.quality.config import QualityConfig, SUPPORTED_MODELS # Check if ONNX models are available DEBERTA_AVAILABLE = Path.home().joinpath( ".cache/mcp_memory/onnx_models/nvidia-quality-classifier-deberta/model.onnx" ).exists() MS_MARCO_AVAILABLE = Path.home().joinpath( ".cache/mcp_memory/onnx_models/ms-marco-MiniLM-L-6-v2/model.onnx" ).exists() class TestModelRegistry: """Test model registry configuration.""" def test_supported_models_exist(self): """Verify SUPPORTED_MODELS contains both models.""" assert 'nvidia-quality-classifier-deberta' in SUPPORTED_MODELS assert 'ms-marco-MiniLM-L-6-v2' in SUPPORTED_MODELS def test_deberta_config_structure(self): """Verify DeBERTa model configuration.""" config = SUPPORTED_MODELS['nvidia-quality-classifier-deberta'] assert config['hf_name'] == 'nvidia/quality-classifier-deberta' assert config['type'] == 'classifier' assert config['size_mb'] == 450 assert 'input_ids' in config['inputs'] assert 'attention_mask' in config['inputs'] assert 'token_type_ids' not in config['inputs'] # Key difference from MS-MARCO assert config['output_classes'] == ['low', 'medium', 'high'] def test_ms_marco_config_structure(self): """Verify MS-MARCO model configuration.""" config = SUPPORTED_MODELS['ms-marco-MiniLM-L-6-v2'] assert config['hf_name'] == 'cross-encoder/ms-marco-MiniLM-L-6-v2' assert config['type'] == 'cross-encoder' assert config['size_mb'] == 23 assert 'input_ids' in config['inputs'] assert 'attention_mask' in config['inputs'] assert 'token_type_ids' in config['inputs'] # Required for cross-encoder assert config['output_classes'] is None # Continuous score def test_config_default_is_deberta(self): """Verify DeBERTa is the default model.""" config = QualityConfig() assert config.local_model == 'nvidia-quality-classifier-deberta' @pytest.mark.skipif(not DEBERTA_AVAILABLE, reason="DeBERTa ONNX model not exported") class TestDeBERTaIntegration: """Test NVIDIA DeBERTa quality classifier.""" def test_deberta_initialization(self): """Test DeBERTa model initialization.""" model = get_onnx_ranker_model( model_name='nvidia-quality-classifier-deberta', device='cpu' ) assert model is not None assert model.model_name == 'nvidia-quality-classifier-deberta' assert model.model_config['type'] == 'classifier' def test_deberta_absolute_quality_scoring(self): """Test DeBERTa scores absolute quality (query-independent).""" model = get_onnx_ranker_model( model_name='nvidia-quality-classifier-deberta', device='cpu' ) # High-quality memory content high_quality = ( "This implementation uses async/await patterns with proper error handling, " "comprehensive logging, and follows Python best practices. The code includes " "detailed docstrings, type hints, and handles edge cases gracefully. " "Performance is optimized with caching and batch processing." ) # Low-quality memory content low_quality = "todo fix this later maybe idk lol" # Medium-quality memory content medium_quality = "Basic implementation that works but lacks documentation and error handling." 
# Query should not matter for classifier - test with different queries query_meaningful = "Python best practices" query_random = "xyzabc random words 12345" query_empty = "" # Score high-quality content with different queries high_score_1 = model.score_quality(query_meaningful, high_quality) high_score_2 = model.score_quality(query_random, high_quality) high_score_3 = model.score_quality(query_empty, high_quality) # Score low-quality content with different queries low_score_1 = model.score_quality(query_meaningful, low_quality) low_score_2 = model.score_quality(query_random, low_quality) low_score_3 = model.score_quality(query_empty, low_quality) # Score medium-quality content medium_score = model.score_quality(query_empty, medium_quality) # High-quality content should score high (≥0.5) # Note: DeBERTa is more conservative than MS-MARCO, rarely giving scores >0.7 assert high_score_1 >= 0.5, f"Expected high score ≥0.5, got {high_score_1}" assert high_score_2 >= 0.5, f"Expected high score ≥0.5, got {high_score_2}" assert high_score_3 >= 0.5, f"Expected high score ≥0.5, got {high_score_3}" # Low-quality content should score low (<0.4) assert low_score_1 < 0.4, f"Expected low score <0.4, got {low_score_1}" assert low_score_2 < 0.4, f"Expected low score <0.4, got {low_score_2}" assert low_score_3 < 0.4, f"Expected low score <0.4, got {low_score_3}" # Medium-quality content should be in middle range assert 0.4 <= medium_score <= 0.7, f"Expected medium score 0.4-0.7, got {medium_score}" # Scores should be similar regardless of query (±0.1 tolerance) # This validates query-independent behavior assert abs(high_score_1 - high_score_2) < 0.1, "High quality scores vary too much with query" assert abs(high_score_1 - high_score_3) < 0.1, "High quality scores vary too much with query" assert abs(low_score_1 - low_score_2) < 0.1, "Low quality scores vary too much with query" assert abs(low_score_1 - low_score_3) < 0.1, "Low quality scores vary too much with query" # Quality ranking should be consistent assert high_score_1 > medium_score > low_score_1, "Quality ranking incorrect" def test_deberta_3class_output_mapping(self): """Test 3-class output mapping to 0-1 scale.""" model = get_onnx_ranker_model( model_name='nvidia-quality-classifier-deberta', device='cpu' ) # Test various quality levels with concrete, specific content # (DeBERTa performs better on specific technical content vs abstract descriptions) excellent = ( "The implementation uses a sophisticated multi-tier architecture with semantic analysis, " "pattern matching, and adaptive learning algorithms to optimize retrieval accuracy." ) average = "The code does some processing and returns a result." 
poor = "stuff things maybe later TODO" scores = [ model.score_quality("", excellent), model.score_quality("", average), model.score_quality("", poor) ] # Verify ordering (excellent > average > poor) assert scores[0] > scores[1] > scores[2], f"Scores not ordered correctly: {scores}" # Verify all scores in valid range [0.0, 1.0] assert all(0.0 <= s <= 1.0 for s in scores), f"Scores out of range: {scores}" # Verify reasonable spread (not all clustered near 0.5) score_range = max(scores) - min(scores) assert score_range > 0.2, f"Scores too clustered (range: {score_range})" def test_deberta_empty_content_handling(self): """Test DeBERTa handles empty content gracefully.""" model = get_onnx_ranker_model( model_name='nvidia-quality-classifier-deberta', device='cpu' ) # Empty content should return 0.0 assert model.score_quality("", "") == 0.0 assert model.score_quality("query", "") == 0.0 @pytest.mark.skipif(not MS_MARCO_AVAILABLE, reason="MS-MARCO ONNX model not found") class TestBackwardCompatibility: """Test backward compatibility with MS-MARCO model.""" def test_ms_marco_still_works(self): """Verify MS-MARCO model still initializes and works.""" model = get_onnx_ranker_model( model_name='ms-marco-MiniLM-L-6-v2', device='cpu' ) assert model is not None assert model.model_name == 'ms-marco-MiniLM-L-6-v2' assert model.model_config['type'] == 'cross-encoder' def test_ms_marco_relevance_scoring(self): """Test MS-MARCO scores query-document relevance.""" model = get_onnx_ranker_model( model_name='ms-marco-MiniLM-L-6-v2', device='cpu' ) content = "Python async/await patterns for concurrent programming." # Relevant query should score higher relevant_query = "Python asynchronous programming" irrelevant_query = "Java database optimization" relevant_score = model.score_quality(relevant_query, content) irrelevant_score = model.score_quality(irrelevant_query, content) # MS-MARCO should give higher score to relevant query assert relevant_score > irrelevant_score, ( f"MS-MARCO should score relevant query higher: " f"relevant={relevant_score}, irrelevant={irrelevant_score}" ) # Both scores should be valid assert 0.0 <= relevant_score <= 1.0 assert 0.0 <= irrelevant_score <= 1.0 def test_ms_marco_requires_query(self): """Test MS-MARCO requires non-empty query.""" model = get_onnx_ranker_model( model_name='ms-marco-MiniLM-L-6-v2', device='cpu' ) content = "Test content" # Empty query should return 0.0 for cross-encoder assert model.score_quality("", content) == 0.0 # Non-empty query should work score = model.score_quality("test query", content) assert 0.0 < score <= 1.0 def test_config_override_to_ms_marco(self): """Test users can override config back to MS-MARCO.""" import os original_value = os.getenv('MCP_QUALITY_LOCAL_MODEL') try: os.environ['MCP_QUALITY_LOCAL_MODEL'] = 'ms-marco-MiniLM-L-6-v2' config = QualityConfig.from_env() assert config.local_model == 'ms-marco-MiniLM-L-6-v2' finally: # Restore original value if original_value: os.environ['MCP_QUALITY_LOCAL_MODEL'] = original_value else: os.environ.pop('MCP_QUALITY_LOCAL_MODEL', None) @pytest.mark.benchmark @pytest.mark.skipif(not DEBERTA_AVAILABLE, reason="DeBERTa ONNX model not exported") class TestPerformance: """Performance benchmarks for DeBERTa.""" def test_deberta_inference_speed(self): """Test DeBERTa inference completes within acceptable time.""" import time model = get_onnx_ranker_model( model_name='nvidia-quality-classifier-deberta', device='cpu' ) content = "Sample memory content for benchmarking quality scoring performance." 
# Warm-up inference model.score_quality("", content) # Benchmark 10 inferences start_time = time.time() for _ in range(10): score = model.score_quality("", content) assert 0.0 <= score <= 1.0 elapsed_time = (time.time() - start_time) * 1000 # Convert to ms avg_time_per_inference = elapsed_time / 10 # Target: <200ms on CPU (DeBERTa is larger than MS-MARCO) assert avg_time_per_inference < 200, ( f"DeBERTa inference too slow: {avg_time_per_inference:.1f}ms " f"(target: <200ms on CPU)" ) print(f"\nDeBERTa performance: {avg_time_per_inference:.1f}ms per inference") @pytest.mark.skipif(not MS_MARCO_AVAILABLE, reason="MS-MARCO model not found") def test_performance_comparison(self): """Compare DeBERTa vs MS-MARCO inference speed.""" import time deberta = get_onnx_ranker_model( model_name='nvidia-quality-classifier-deberta', device='cpu' ) ms_marco = get_onnx_ranker_model( model_name='ms-marco-MiniLM-L-6-v2', device='cpu' ) content = "Test content for performance comparison." query = "test query" # Warm-up deberta.score_quality("", content) ms_marco.score_quality(query, content) # Benchmark DeBERTa start = time.time() for _ in range(10): deberta.score_quality("", content) deberta_time = (time.time() - start) * 1000 / 10 # Benchmark MS-MARCO start = time.time() for _ in range(10): ms_marco.score_quality(query, content) ms_marco_time = (time.time() - start) * 1000 / 10 print(f"\nPerformance comparison:") print(f" DeBERTa: {deberta_time:.1f}ms per inference") print(f" MS-MARCO: {ms_marco_time:.1f}ms per inference") print(f" Overhead: {((deberta_time / ms_marco_time - 1) * 100):.1f}%") # DeBERTa should be slower (larger model: 450MB vs 23MB) # Target: <10x slower than MS-MARCO (typically 6-7x on CPU, 2-3x on GPU) # Note: Trade-off for better quality assessment and no self-matching bias assert deberta_time < ms_marco_time * 10, ( f"DeBERTa excessively slow compared to MS-MARCO: " f"{deberta_time:.1f}ms vs {ms_marco_time:.1f}ms" ) if __name__ == "__main__": pytest.main([__file__, "-v", "--tb=short", "-s"])
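The `test_deberta_3class_output_mapping` test asserts that the ranker collapses the classifier's low/medium/high output into a single 0-1 score, but the mapping itself lives in `ONNXRankerModel`, which is not shown here. Below is a minimal illustrative sketch of one way such a mapping could work, assuming softmax probabilities over logits ordered [low, medium, high] and hypothetical class weights of 0.0/0.5/1.0; this is an assumption for illustration, not the actual implementation.

# Illustrative sketch only -- NOT the actual ONNXRankerModel code.
# Assumes the classifier head emits three logits ordered [low, medium, high],
# matching config['output_classes'] asserted in TestModelRegistry.
import numpy as np

def map_3class_logits_to_score(logits: np.ndarray) -> float:
    """Collapse a 3-class quality distribution into a single 0-1 score."""
    # Softmax over the class logits
    exp = np.exp(logits - logits.max())
    probs = exp / exp.sum()
    # Expected value under hypothetical class weights low=0.0, medium=0.5, high=1.0,
    # chosen so a "high"-leaning distribution clears the 0.5 threshold the tests use
    weights = np.array([0.0, 0.5, 1.0])
    return float(np.dot(probs, weights))

# Example: logits leaning toward "high" yield a score above 0.5
print(round(map_3class_logits_to_score(np.array([-1.0, 0.5, 2.0])), 3))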

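The performance class is gated behind a custom `benchmark` marker, and the integration classes skip themselves when the exported ONNX files are missing from `~/.cache/mcp_memory/onnx_models/`. A minimal sketch of running the functional tests while deselecting the benchmarks is shown below; the `tests/` path and the registration of the `benchmark` marker in the project's pytest configuration are assumptions, not shown in this file.

# Sketch: run the functional tests, deselect the @pytest.mark.benchmark class.
# Assumes the file lives at tests/test_deberta_quality.py and that the
# "benchmark" marker is registered in the project's pytest configuration.
import pytest

exit_code = pytest.main([
    "tests/test_deberta_quality.py",
    "-v",
    "-m", "not benchmark",  # skips TestPerformance
])
raise SystemExit(exit_code)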