"""Tests for NVIDIA DeBERTa quality classifier integration."""
import pytest
from pathlib import Path
from mcp_memory_service.quality.onnx_ranker import ONNXRankerModel, get_onnx_ranker_model
from mcp_memory_service.quality.config import QualityConfig, SUPPORTED_MODELS
# Check if ONNX models are available
DEBERTA_AVAILABLE = Path.home().joinpath(
".cache/mcp_memory/onnx_models/nvidia-quality-classifier-deberta/model.onnx"
).exists()
MS_MARCO_AVAILABLE = Path.home().joinpath(
".cache/mcp_memory/onnx_models/ms-marco-MiniLM-L-6-v2/model.onnx"
).exists()
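# NOTE: these flags assume the ONNX files were exported ahead of time into the
# shared ~/.cache/mcp_memory/onnx_models cache (e.g. by the project's model
# export tooling); when a file is missing, the matching test class below is
# skipped via @pytest.mark.skipif rather than failed.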


class TestModelRegistry:
    """Test model registry configuration."""

    def test_supported_models_exist(self):
        """Verify SUPPORTED_MODELS contains both models."""
        assert 'nvidia-quality-classifier-deberta' in SUPPORTED_MODELS
        assert 'ms-marco-MiniLM-L-6-v2' in SUPPORTED_MODELS

    def test_deberta_config_structure(self):
        """Verify DeBERTa model configuration."""
        config = SUPPORTED_MODELS['nvidia-quality-classifier-deberta']
        assert config['hf_name'] == 'nvidia/quality-classifier-deberta'
        assert config['type'] == 'classifier'
        assert config['size_mb'] == 450
        assert 'input_ids' in config['inputs']
        assert 'attention_mask' in config['inputs']
        assert 'token_type_ids' not in config['inputs']  # Key difference from MS-MARCO
        assert config['output_classes'] == ['low', 'medium', 'high']

    def test_ms_marco_config_structure(self):
        """Verify MS-MARCO model configuration."""
        config = SUPPORTED_MODELS['ms-marco-MiniLM-L-6-v2']
        assert config['hf_name'] == 'cross-encoder/ms-marco-MiniLM-L-6-v2'
        assert config['type'] == 'cross-encoder'
        assert config['size_mb'] == 23
        assert 'input_ids' in config['inputs']
        assert 'attention_mask' in config['inputs']
        assert 'token_type_ids' in config['inputs']  # Required for cross-encoder
        assert config['output_classes'] is None  # Continuous score

    def test_config_default_is_deberta(self):
        """Verify DeBERTa is the default model."""
        config = QualityConfig()
        assert config.local_model == 'nvidia-quality-classifier-deberta'
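

# For reference, the assertions above imply a registry entry shaped roughly
# like the following (a sketch reconstructed from the tests, not the literal
# definition in mcp_memory_service.quality.config):
#
#     'nvidia-quality-classifier-deberta': {
#         'hf_name': 'nvidia/quality-classifier-deberta',
#         'type': 'classifier',
#         'size_mb': 450,
#         'inputs': ['input_ids', 'attention_mask'],
#         'output_classes': ['low', 'medium', 'high'],
#     }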


@pytest.mark.skipif(not DEBERTA_AVAILABLE, reason="DeBERTa ONNX model not exported")
class TestDeBERTaIntegration:
    """Test NVIDIA DeBERTa quality classifier."""

    def test_deberta_initialization(self):
        """Test DeBERTa model initialization."""
        model = get_onnx_ranker_model(
            model_name='nvidia-quality-classifier-deberta',
            device='cpu'
        )
        assert model is not None
        assert model.model_name == 'nvidia-quality-classifier-deberta'
        assert model.model_config['type'] == 'classifier'

    def test_deberta_absolute_quality_scoring(self):
        """Test DeBERTa scores absolute quality (query-independent)."""
        model = get_onnx_ranker_model(
            model_name='nvidia-quality-classifier-deberta',
            device='cpu'
        )

        # High-quality memory content
        high_quality = (
            "This implementation uses async/await patterns with proper error handling, "
            "comprehensive logging, and follows Python best practices. The code includes "
            "detailed docstrings, type hints, and handles edge cases gracefully. "
            "Performance is optimized with caching and batch processing."
        )
        # Low-quality memory content
        low_quality = "todo fix this later maybe idk lol"
        # Medium-quality memory content
        medium_quality = "Basic implementation that works but lacks documentation and error handling."

        # Query should not matter for classifier - test with different queries
        query_meaningful = "Python best practices"
        query_random = "xyzabc random words 12345"
        query_empty = ""

        # Score high-quality content with different queries
        high_score_1 = model.score_quality(query_meaningful, high_quality)
        high_score_2 = model.score_quality(query_random, high_quality)
        high_score_3 = model.score_quality(query_empty, high_quality)

        # Score low-quality content with different queries
        low_score_1 = model.score_quality(query_meaningful, low_quality)
        low_score_2 = model.score_quality(query_random, low_quality)
        low_score_3 = model.score_quality(query_empty, low_quality)

        # Score medium-quality content
        medium_score = model.score_quality(query_empty, medium_quality)

        # High-quality content should score high (≥0.5)
        # Note: DeBERTa is more conservative than MS-MARCO, rarely giving scores >0.7
        assert high_score_1 >= 0.5, f"Expected high score ≥0.5, got {high_score_1}"
        assert high_score_2 >= 0.5, f"Expected high score ≥0.5, got {high_score_2}"
        assert high_score_3 >= 0.5, f"Expected high score ≥0.5, got {high_score_3}"

        # Low-quality content should score low (<0.4)
        assert low_score_1 < 0.4, f"Expected low score <0.4, got {low_score_1}"
        assert low_score_2 < 0.4, f"Expected low score <0.4, got {low_score_2}"
        assert low_score_3 < 0.4, f"Expected low score <0.4, got {low_score_3}"

        # Medium-quality content should be in middle range
        assert 0.4 <= medium_score <= 0.7, f"Expected medium score 0.4-0.7, got {medium_score}"

        # Scores should be similar regardless of query (±0.1 tolerance);
        # this validates query-independent behavior
        assert abs(high_score_1 - high_score_2) < 0.1, "High quality scores vary too much with query"
        assert abs(high_score_1 - high_score_3) < 0.1, "High quality scores vary too much with query"
        assert abs(low_score_1 - low_score_2) < 0.1, "Low quality scores vary too much with query"
        assert abs(low_score_1 - low_score_3) < 0.1, "Low quality scores vary too much with query"

        # Quality ranking should be consistent
        assert high_score_1 > medium_score > low_score_1, "Quality ranking incorrect"

    def test_deberta_3class_output_mapping(self):
        """Test 3-class output mapping to 0-1 scale."""
        model = get_onnx_ranker_model(
            model_name='nvidia-quality-classifier-deberta',
            device='cpu'
        )

        # Test various quality levels with concrete, specific content
        # (DeBERTa performs better on specific technical content vs abstract descriptions)
        excellent = (
            "The implementation uses a sophisticated multi-tier architecture with semantic analysis, "
            "pattern matching, and adaptive learning algorithms to optimize retrieval accuracy."
        )
        average = "The code does some processing and returns a result."
        poor = "stuff things maybe later TODO"

        scores = [
            model.score_quality("", excellent),
            model.score_quality("", average),
            model.score_quality("", poor)
        ]

        # Verify ordering (excellent > average > poor)
        assert scores[0] > scores[1] > scores[2], f"Scores not ordered correctly: {scores}"
        # Verify all scores in valid range [0.0, 1.0]
        assert all(0.0 <= s <= 1.0 for s in scores), f"Scores out of range: {scores}"
        # Verify reasonable spread (not all clustered near 0.5)
        score_range = max(scores) - min(scores)
        assert score_range > 0.2, f"Scores too clustered (range: {score_range})"
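
    # How the 3-class softmax collapses to the scalar asserted above is an
    # implementation detail of onnx_ranker; a plausible expected-value mapping
    # (a hedged sketch, not necessarily the actual formula) would be
    #     score = 0.0 * p_low + 0.5 * p_medium + 1.0 * p_high
    # which keeps scores in [0.0, 1.0] and preserves the low < medium < high
    # ordering these assertions rely on.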

    def test_deberta_empty_content_handling(self):
        """Test DeBERTa handles empty content gracefully."""
        model = get_onnx_ranker_model(
            model_name='nvidia-quality-classifier-deberta',
            device='cpu'
        )

        # Empty content should return 0.0
        assert model.score_quality("", "") == 0.0
        assert model.score_quality("query", "") == 0.0


@pytest.mark.skipif(not MS_MARCO_AVAILABLE, reason="MS-MARCO ONNX model not found")
class TestBackwardCompatibility:
    """Test backward compatibility with MS-MARCO model."""

    def test_ms_marco_still_works(self):
        """Verify MS-MARCO model still initializes and works."""
        model = get_onnx_ranker_model(
            model_name='ms-marco-MiniLM-L-6-v2',
            device='cpu'
        )
        assert model is not None
        assert model.model_name == 'ms-marco-MiniLM-L-6-v2'
        assert model.model_config['type'] == 'cross-encoder'

    def test_ms_marco_relevance_scoring(self):
        """Test MS-MARCO scores query-document relevance."""
        model = get_onnx_ranker_model(
            model_name='ms-marco-MiniLM-L-6-v2',
            device='cpu'
        )
        content = "Python async/await patterns for concurrent programming."

        # Relevant query should score higher
        relevant_query = "Python asynchronous programming"
        irrelevant_query = "Java database optimization"

        relevant_score = model.score_quality(relevant_query, content)
        irrelevant_score = model.score_quality(irrelevant_query, content)

        # MS-MARCO should give higher score to relevant query
        assert relevant_score > irrelevant_score, (
            f"MS-MARCO should score relevant query higher: "
            f"relevant={relevant_score}, irrelevant={irrelevant_score}"
        )
        # Both scores should be valid
        assert 0.0 <= relevant_score <= 1.0
        assert 0.0 <= irrelevant_score <= 1.0
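        # The [0, 1] range asserted here presumably comes from squashing the
        # cross-encoder's raw relevance logit (commonly via a sigmoid); the
        # exact normalization lives in onnx_ranker, so this is an assumption.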

    def test_ms_marco_requires_query(self):
        """Test MS-MARCO requires non-empty query."""
        model = get_onnx_ranker_model(
            model_name='ms-marco-MiniLM-L-6-v2',
            device='cpu'
        )
        content = "Test content"

        # Empty query should return 0.0 for cross-encoder
        assert model.score_quality("", content) == 0.0
        # Non-empty query should work
        score = model.score_quality("test query", content)
        assert 0.0 < score <= 1.0

    def test_config_override_to_ms_marco(self):
        """Test users can override config back to MS-MARCO."""
        import os
        original_value = os.getenv('MCP_QUALITY_LOCAL_MODEL')
        try:
            os.environ['MCP_QUALITY_LOCAL_MODEL'] = 'ms-marco-MiniLM-L-6-v2'
            config = QualityConfig.from_env()
            assert config.local_model == 'ms-marco-MiniLM-L-6-v2'
        finally:
            # Restore original value (checking `is not None` so that an
            # empty-string value is restored rather than dropped)
            if original_value is not None:
                os.environ['MCP_QUALITY_LOCAL_MODEL'] = original_value
            else:
                os.environ.pop('MCP_QUALITY_LOCAL_MODEL', None)
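
    # Note: pytest's built-in monkeypatch fixture (monkeypatch.setenv /
    # monkeypatch.delenv) would scope this environment change to the test
    # automatically; the manual try/finally is kept here for explicitness.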


@pytest.mark.benchmark
@pytest.mark.skipif(not DEBERTA_AVAILABLE, reason="DeBERTa ONNX model not exported")
class TestPerformance:
    """Performance benchmarks for DeBERTa."""

    def test_deberta_inference_speed(self):
        """Test DeBERTa inference completes within acceptable time."""
        import time
        model = get_onnx_ranker_model(
            model_name='nvidia-quality-classifier-deberta',
            device='cpu'
        )
        content = "Sample memory content for benchmarking quality scoring performance."

        # Warm-up inference
        model.score_quality("", content)

        # Benchmark 10 inferences
        start_time = time.time()
        for _ in range(10):
            score = model.score_quality("", content)
            assert 0.0 <= score <= 1.0
        elapsed_time = (time.time() - start_time) * 1000  # Convert to ms
        avg_time_per_inference = elapsed_time / 10

        # Target: <200ms on CPU (DeBERTa is larger than MS-MARCO)
        assert avg_time_per_inference < 200, (
            f"DeBERTa inference too slow: {avg_time_per_inference:.1f}ms "
            f"(target: <200ms on CPU)"
        )
        print(f"\nDeBERTa performance: {avg_time_per_inference:.1f}ms per inference")

    @pytest.mark.skipif(not MS_MARCO_AVAILABLE, reason="MS-MARCO model not found")
    def test_performance_comparison(self):
        """Compare DeBERTa vs MS-MARCO inference speed."""
        import time
        deberta = get_onnx_ranker_model(
            model_name='nvidia-quality-classifier-deberta',
            device='cpu'
        )
        ms_marco = get_onnx_ranker_model(
            model_name='ms-marco-MiniLM-L-6-v2',
            device='cpu'
        )
        content = "Test content for performance comparison."
        query = "test query"

        # Warm-up
        deberta.score_quality("", content)
        ms_marco.score_quality(query, content)

        # Benchmark DeBERTa
        start = time.time()
        for _ in range(10):
            deberta.score_quality("", content)
        deberta_time = (time.time() - start) * 1000 / 10

        # Benchmark MS-MARCO
        start = time.time()
        for _ in range(10):
            ms_marco.score_quality(query, content)
        ms_marco_time = (time.time() - start) * 1000 / 10

        print("\nPerformance comparison:")
        print(f"  DeBERTa:  {deberta_time:.1f}ms per inference")
        print(f"  MS-MARCO: {ms_marco_time:.1f}ms per inference")
        print(f"  Overhead: {((deberta_time / ms_marco_time - 1) * 100):.1f}%")

        # DeBERTa should be slower (larger model: 450MB vs 23MB)
        # Target: <10x slower than MS-MARCO (typically 6-7x on CPU, 2-3x on GPU)
        # Note: trade-off for better quality assessment and no self-matching bias
        assert deberta_time < ms_marco_time * 10, (
            f"DeBERTa excessively slow compared to MS-MARCO: "
            f"{deberta_time:.1f}ms vs {ms_marco_time:.1f}ms"
        )
if __name__ == "__main__":
pytest.main([__file__, "-v", "--tb=short", "-s"])