"""Tests for NVIDIA DeBERTa quality classifier integration."""
import pytest
from pathlib import Path
from mcp_memory_service.quality.onnx_ranker import ONNXRankerModel, get_onnx_ranker_model
from mcp_memory_service.quality.config import QualityConfig, SUPPORTED_MODELS
# Check if ONNX models are available
DEBERTA_AVAILABLE = Path.home().joinpath(
".cache/mcp_memory/onnx_models/nvidia-quality-classifier-deberta/model.onnx"
).exists()
MS_MARCO_AVAILABLE = Path.home().joinpath(
".cache/mcp_memory/onnx_models/ms-marco-MiniLM-L-6-v2/model.onnx"
).exists()
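# NOTE: these flags assume the ONNX files were exported ahead of time into the
# shared ~/.cache/mcp_memory/onnx_models cache (e.g. by the project's model
# export tooling); when a file is missing, the matching test class below is
# skipped via @pytest.mark.skipif rather than failed.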


class TestModelRegistry:
    """Test model registry configuration."""

    def test_supported_models_exist(self):
        """Verify SUPPORTED_MODELS contains both models."""
        assert 'nvidia-quality-classifier-deberta' in SUPPORTED_MODELS
        assert 'ms-marco-MiniLM-L-6-v2' in SUPPORTED_MODELS

    def test_deberta_config_structure(self):
        """Verify DeBERTa model configuration."""
        config = SUPPORTED_MODELS['nvidia-quality-classifier-deberta']
        assert config['hf_name'] == 'nvidia/quality-classifier-deberta'
        assert config['type'] == 'classifier'
        assert config['size_mb'] == 450
        assert 'input_ids' in config['inputs']
        assert 'attention_mask' in config['inputs']
        assert 'token_type_ids' not in config['inputs']  # Key difference from MS-MARCO
        assert config['output_classes'] == ['low', 'medium', 'high']

    def test_ms_marco_config_structure(self):
        """Verify MS-MARCO model configuration."""
        config = SUPPORTED_MODELS['ms-marco-MiniLM-L-6-v2']
        assert config['hf_name'] == 'cross-encoder/ms-marco-MiniLM-L-6-v2'
        assert config['type'] == 'cross-encoder'
        assert config['size_mb'] == 23
        assert 'input_ids' in config['inputs']
        assert 'attention_mask' in config['inputs']
        assert 'token_type_ids' in config['inputs']  # Required for cross-encoder
        assert config['output_classes'] is None  # Continuous score

    def test_config_default_is_deberta(self):
        """Verify DeBERTa is the default model."""
        config = QualityConfig()
        assert config.local_model == 'nvidia-quality-classifier-deberta'
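

# For reference, the assertions above imply a registry entry shaped roughly
# like the following (a sketch reconstructed from the tests, not the literal
# definition in mcp_memory_service.quality.config):
#
#     'nvidia-quality-classifier-deberta': {
#         'hf_name': 'nvidia/quality-classifier-deberta',
#         'type': 'classifier',
#         'size_mb': 450,
#         'inputs': ['input_ids', 'attention_mask'],
#         'output_classes': ['low', 'medium', 'high'],
#     }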


@pytest.mark.skipif(not DEBERTA_AVAILABLE, reason="DeBERTa ONNX model not exported")
class TestDeBERTaIntegration:
    """Test NVIDIA DeBERTa quality classifier."""

    def test_deberta_initialization(self):
        """Test DeBERTa model initialization."""
        model = get_onnx_ranker_model(
            model_name='nvidia-quality-classifier-deberta',
            device='cpu'
        )
        assert model is not None
        assert model.model_name == 'nvidia-quality-classifier-deberta'
        assert model.model_config['type'] == 'classifier'

    def test_deberta_absolute_quality_scoring(self):
        """Test DeBERTa scores absolute quality (query-independent)."""
        model = get_onnx_ranker_model(
            model_name='nvidia-quality-classifier-deberta',
            device='cpu'
        )

        # High-quality memory content
        high_quality = (
            "This implementation uses async/await patterns with proper error handling, "
            "comprehensive logging, and follows Python best practices. The code includes "
            "detailed docstrings, type hints, and handles edge cases gracefully. "
            "Performance is optimized with caching and batch processing."
        )
        # Low-quality memory content
        low_quality = "todo fix this later maybe idk lol"
        # Medium-quality memory content
        medium_quality = "Basic implementation that works but lacks documentation and error handling."

        # Query should not matter for classifier - test with different queries
        query_meaningful = "Python best practices"
        query_random = "xyzabc random words 12345"
        query_empty = ""

        # Score high-quality content with different queries
        high_score_1 = model.score_quality(query_meaningful, high_quality)
        high_score_2 = model.score_quality(query_random, high_quality)
        high_score_3 = model.score_quality(query_empty, high_quality)

        # Score low-quality content with different queries
        low_score_1 = model.score_quality(query_meaningful, low_quality)
        low_score_2 = model.score_quality(query_random, low_quality)
        low_score_3 = model.score_quality(query_empty, low_quality)

        # Score medium-quality content
        medium_score = model.score_quality(query_empty, medium_quality)

        # High-quality content should score high (≥0.5)
        # Note: DeBERTa is more conservative than MS-MARCO, rarely giving scores >0.7
        assert high_score_1 >= 0.5, f"Expected high score ≥0.5, got {high_score_1}"
        assert high_score_2 >= 0.5, f"Expected high score ≥0.5, got {high_score_2}"
        assert high_score_3 >= 0.5, f"Expected high score ≥0.5, got {high_score_3}"

        # Low-quality content should score low (<0.4)
        assert low_score_1 < 0.4, f"Expected low score <0.4, got {low_score_1}"
        assert low_score_2 < 0.4, f"Expected low score <0.4, got {low_score_2}"
        assert low_score_3 < 0.4, f"Expected low score <0.4, got {low_score_3}"

        # Medium-quality content should be in middle range
        assert 0.4 <= medium_score <= 0.7, f"Expected medium score 0.4-0.7, got {medium_score}"

        # Scores should be similar regardless of query (±0.1 tolerance);
        # this validates query-independent behavior
        assert abs(high_score_1 - high_score_2) < 0.1, "High quality scores vary too much with query"
        assert abs(high_score_1 - high_score_3) < 0.1, "High quality scores vary too much with query"
        assert abs(low_score_1 - low_score_2) < 0.1, "Low quality scores vary too much with query"
        assert abs(low_score_1 - low_score_3) < 0.1, "Low quality scores vary too much with query"

        # Quality ranking should be consistent
        assert high_score_1 > medium_score > low_score_1, "Quality ranking incorrect"

    def test_deberta_3class_output_mapping(self):
        """Test 3-class output mapping to 0-1 scale."""
        model = get_onnx_ranker_model(
            model_name='nvidia-quality-classifier-deberta',
            device='cpu'
        )

        # Test various quality levels with concrete, specific content
        # (DeBERTa performs better on specific technical content vs abstract descriptions)
        excellent = (
            "The implementation uses a sophisticated multi-tier architecture with semantic analysis, "
            "pattern matching, and adaptive learning algorithms to optimize retrieval accuracy."
        )
        average = "The code does some processing and returns a result."
        poor = "stuff things maybe later TODO"

        scores = [
            model.score_quality("", excellent),
            model.score_quality("", average),
            model.score_quality("", poor)
        ]

        # Verify ordering (excellent > average > poor)
        assert scores[0] > scores[1] > scores[2], f"Scores not ordered correctly: {scores}"
        # Verify all scores in valid range [0.0, 1.0]
        assert all(0.0 <= s <= 1.0 for s in scores), f"Scores out of range: {scores}"
        # Verify reasonable spread (not all clustered near 0.5)
        score_range = max(scores) - min(scores)
        assert score_range > 0.2, f"Scores too clustered (range: {score_range})"
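
    # How the 3-class softmax collapses to the scalar asserted above is an
    # implementation detail of onnx_ranker; a plausible expected-value mapping
    # (a hedged sketch, not necessarily the actual formula) would be
    #     score = 0.0 * p_low + 0.5 * p_medium + 1.0 * p_high
    # which keeps scores in [0.0, 1.0] and preserves the low < medium < high
    # ordering these assertions rely on.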

    def test_deberta_empty_content_handling(self):
        """Test DeBERTa handles empty content gracefully."""
        model = get_onnx_ranker_model(
            model_name='nvidia-quality-classifier-deberta',
            device='cpu'
        )

        # Empty content should return 0.0
        assert model.score_quality("", "") == 0.0
        assert model.score_quality("query", "") == 0.0


@pytest.mark.skipif(not MS_MARCO_AVAILABLE, reason="MS-MARCO ONNX model not found")
class TestBackwardCompatibility:
    """Test backward compatibility with MS-MARCO model."""

    def test_ms_marco_still_works(self):
        """Verify MS-MARCO model still initializes and works."""
        model = get_onnx_ranker_model(
            model_name='ms-marco-MiniLM-L-6-v2',
            device='cpu'
        )
        assert model is not None
        assert model.model_name == 'ms-marco-MiniLM-L-6-v2'
        assert model.model_config['type'] == 'cross-encoder'

    def test_ms_marco_relevance_scoring(self):
        """Test MS-MARCO scores query-document relevance."""
        model = get_onnx_ranker_model(
            model_name='ms-marco-MiniLM-L-6-v2',
            device='cpu'
        )
        content = "Python async/await patterns for concurrent programming."

        # Relevant query should score higher
        relevant_query = "Python asynchronous programming"
        irrelevant_query = "Java database optimization"

        relevant_score = model.score_quality(relevant_query, content)
        irrelevant_score = model.score_quality(irrelevant_query, content)

        # MS-MARCO should give higher score to relevant query
        assert relevant_score > irrelevant_score, (
            f"MS-MARCO should score relevant query higher: "
            f"relevant={relevant_score}, irrelevant={irrelevant_score}"
        )
        # Both scores should be valid
        assert 0.0 <= relevant_score <= 1.0
        assert 0.0 <= irrelevant_score <= 1.0
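        # The [0, 1] range asserted here presumably comes from squashing the
        # cross-encoder's raw relevance logit (commonly via a sigmoid); the
        # exact normalization lives in onnx_ranker, so this is an assumption.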

    def test_ms_marco_requires_query(self):
        """Test MS-MARCO requires non-empty query."""
        model = get_onnx_ranker_model(
            model_name='ms-marco-MiniLM-L-6-v2',
            device='cpu'
        )
        content = "Test content"

        # Empty query should return 0.0 for cross-encoder
        assert model.score_quality("", content) == 0.0
        # Non-empty query should work
        score = model.score_quality("test query", content)
        assert 0.0 < score <= 1.0

    def test_config_override_to_ms_marco(self):
        """Test users can override config back to MS-MARCO."""
        import os
        original_value = os.getenv('MCP_QUALITY_LOCAL_MODEL')
        try:
            os.environ['MCP_QUALITY_LOCAL_MODEL'] = 'ms-marco-MiniLM-L-6-v2'
            config = QualityConfig.from_env()
            assert config.local_model == 'ms-marco-MiniLM-L-6-v2'
        finally:
            # Restore original value (checking `is not None` so that an
            # empty-string value is restored rather than dropped)
            if original_value is not None:
                os.environ['MCP_QUALITY_LOCAL_MODEL'] = original_value
            else:
                os.environ.pop('MCP_QUALITY_LOCAL_MODEL', None)
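
    # Note: pytest's built-in monkeypatch fixture (monkeypatch.setenv /
    # monkeypatch.delenv) would scope this environment change to the test
    # automatically; the manual try/finally is kept here for explicitness.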


@pytest.mark.benchmark
@pytest.mark.skipif(not DEBERTA_AVAILABLE, reason="DeBERTa ONNX model not exported")
class TestPerformance:
    """Performance benchmarks for DeBERTa."""

    def test_deberta_inference_speed(self):
        """Test DeBERTa inference completes within acceptable time."""
        import time
        model = get_onnx_ranker_model(
            model_name='nvidia-quality-classifier-deberta',
            device='cpu'
        )
        content = "Sample memory content for benchmarking quality scoring performance."

        # Warm-up inference
        model.score_quality("", content)

        # Benchmark 10 inferences
        start_time = time.time()
        for _ in range(10):
            score = model.score_quality("", content)
            assert 0.0 <= score <= 1.0
        elapsed_time = (time.time() - start_time) * 1000  # Convert to ms
        avg_time_per_inference = elapsed_time / 10

        # Target: <200ms on CPU (DeBERTa is larger than MS-MARCO)
        assert avg_time_per_inference < 200, (
            f"DeBERTa inference too slow: {avg_time_per_inference:.1f}ms "
            f"(target: <200ms on CPU)"
        )
        print(f"\nDeBERTa performance: {avg_time_per_inference:.1f}ms per inference")

    @pytest.mark.skipif(not MS_MARCO_AVAILABLE, reason="MS-MARCO model not found")
    def test_performance_comparison(self):
        """Compare DeBERTa vs MS-MARCO inference speed."""
        import time
        deberta = get_onnx_ranker_model(
            model_name='nvidia-quality-classifier-deberta',
            device='cpu'
        )
        ms_marco = get_onnx_ranker_model(
            model_name='ms-marco-MiniLM-L-6-v2',
            device='cpu'
        )
        content = "Test content for performance comparison."
        query = "test query"

        # Warm-up
        deberta.score_quality("", content)
        ms_marco.score_quality(query, content)

        # Benchmark DeBERTa
        start = time.time()
        for _ in range(10):
            deberta.score_quality("", content)
        deberta_time = (time.time() - start) * 1000 / 10

        # Benchmark MS-MARCO
        start = time.time()
        for _ in range(10):
            ms_marco.score_quality(query, content)
        ms_marco_time = (time.time() - start) * 1000 / 10

        print("\nPerformance comparison:")
        print(f"  DeBERTa:  {deberta_time:.1f}ms per inference")
        print(f"  MS-MARCO: {ms_marco_time:.1f}ms per inference")
        print(f"  Overhead: {((deberta_time / ms_marco_time - 1) * 100):.1f}%")

        # DeBERTa should be slower (larger model: 450MB vs 23MB)
        # Target: <10x slower than MS-MARCO (typically 6-7x on CPU, 2-3x on GPU)
        # Note: trade-off for better quality assessment and no self-matching bias
        assert deberta_time < ms_marco_time * 10, (
            f"DeBERTa excessively slow compared to MS-MARCO: "
            f"{deberta_time:.1f}ms vs {ms_marco_time:.1f}ms"
        )
if __name__ == "__main__":
pytest.main([__file__, "-v", "--tb=short", "-s"])