evaluate

Run perplexity evaluation to assess quantized model quality. Lower perplexity indicates better quality; the tool outputs a rating from EXCELLENT to POOR.

Instructions

Run perplexity evaluation on a quantized model.

Measures model quality after quantization using perplexity scoring. Lower perplexity = better quality. Includes a quality assessment (EXCELLENT/GOOD/FAIR/DEGRADED/POOR).

Args:
  • model_path: Path to the quantized model file (GGUF) or directory (GPTQ/AWQ).
  • format: Format of the quantized model. One of 'gguf', 'gptq', 'awq'.
  • bits: Bit width used during quantization (for quality context).

Returns: Perplexity score, quality assessment, and evaluation metadata.
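
For illustration, a successful run against a 4-bit GGUF model might return a payload shaped like this (field names are taken from the implementation below; the values are hypothetical):

    {
        "success": True,
        "perplexity": 8.73,   # hypothetical score
        "method": "llama.cpp",
        "dataset": "wikitext-2",
        "quality": "EXCELLENT",
        "assessment": "Minimal quality loss from quantization.",
        "format": "gguf",
        "bits": 4,
    }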

Input Schema

Name          Required   Default
model_path    Yes        —
format        No         gguf
bits          No         4

Output Schema

No output fields are defined; the tool returns a free-form result dict (see Returns above).

Implementation Reference

  • Main handler function evaluate_model() that dispatches to format-specific evaluation and adds a quality assessment.
    def evaluate_model(
        model_path: str, fmt: str, bits: int
    ) -> dict[str, Any]:
        """Run perplexity evaluation on a quantized model.
    
        Args:
            model_path: Path to the quantized model file or directory.
            fmt: Format of the model ('gguf', 'gptq', or 'awq').
            bits: Bit width used for quantization.
    
        Returns:
            Result dict with perplexity score and quality assessment.
        """
        if fmt == "gguf":
            result = evaluate_gguf(model_path)
        elif fmt in ("gptq", "awq"):
            result = evaluate_transformers(model_path, fmt)
        else:
            return {
                "success": False,
                "error": f"Evaluation not supported for format '{fmt}'.",
            }
    
        # Add quality assessment if we got a perplexity score
        if result.get("success") and result.get("perplexity"):
            ppl = result["perplexity"]
            if ppl < 10:
                result["quality"] = "EXCELLENT"
                result["assessment"] = "Minimal quality loss from quantization."
            elif ppl < 20:
                result["quality"] = "GOOD"
                result["assessment"] = "Acceptable quality for most use cases."
            elif ppl < 50:
                result["quality"] = "FAIR"
                result["assessment"] = (
                    f"Some quality degradation at {bits}-bit. "
                    f"Consider using higher bits."
                )
            elif ppl < 100:
                result["quality"] = "DEGRADED"
                result["assessment"] = (
                    f"Significant quality loss at {bits}-bit. "
                    f"Recommend {min(bits + 1, 8)}-bit or higher."
                )
            else:
                result["quality"] = "POOR"
                result["assessment"] = (
                    "Severe quality loss. Model may produce incoherent output. "
                    "Use higher bit quantization."
                )
    
        result["format"] = fmt
        result["bits"] = bits
        return result
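Both evaluation paths below reduce to the same arithmetic: perplexity is the exponential of the mean per-token negative log-likelihood, and the thresholds above are applied to that value. A minimal standalone sketch of the relationship, using hypothetical token log-probabilities:

    import math

    # Hypothetical per-token log-probabilities (natural log) from some model
    token_logprobs = [-1.9, -2.3, -0.8, -1.5, -2.1]

    avg_nll = -sum(token_logprobs) / len(token_logprobs)  # mean negative log-likelihood
    perplexity = math.exp(avg_nll)  # ~5.6 here -> "EXCELLENT" under the thresholds above

    print(round(perplexity, 2))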
  • GGUF model evaluation using llama-perplexity binary or llama-cpp-python library.
    def evaluate_gguf(model_path: str) -> dict[str, Any]:
        """Evaluate GGUF model perplexity.
    
        Tries in order:
        1. llama-perplexity binary (from llama.cpp)
        2. llama-cpp-python library
        """
        # Method 1: llama-perplexity binary
        llama_perplexity = shutil.which("llama-perplexity") or shutil.which("perplexity")
        if llama_perplexity:
            try:
                cmd = [
                    llama_perplexity,
                    "-m",
                    model_path,
                    "-f",
                    "wikitext-2-raw/wiki.test.raw",
                    "--ctx-size",
                    "512",
                    "--chunks",
                    "20",
                ]
                result = subprocess.run(
                    cmd, capture_output=True, text=True, timeout=600
                )
                if result.returncode == 0:
                    for line in result.stdout.split("\n"):
                        if "perplexity" in line.lower() and "=" in line:
                            try:
                                ppl = float(line.split("=")[-1].strip().split()[0])
                                return {
                                    "success": True,
                                    "perplexity": round(ppl, 2),
                                    "method": "llama.cpp",
                                    "dataset": "wikitext-2",
                                }
                            except (ValueError, IndexError):
                                pass
            except (subprocess.TimeoutExpired, FileNotFoundError):
                pass
    
        # Method 2: llama-cpp-python
        try:
            from llama_cpp import Llama
    
            llm = Llama(model_path=model_path, n_ctx=512, verbose=False)
    
            test_texts = [
                "The quick brown fox jumps over the lazy dog. This is a standard test sentence used to evaluate language model quality.",
                "In machine learning, quantization refers to the process of reducing the number of bits that represent a number.",
                "The Transformer architecture has become the dominant paradigm in natural language processing and computer vision.",
                "Large language models have demonstrated remarkable capabilities in text generation and reasoning tasks.",
                "Neural networks consist of layers of interconnected nodes that process information using learned weights.",
            ]
    
            total_loss = 0.0
            total_tokens = 0
            for text in test_texts:
                result = llm.create_completion(
                    text, max_tokens=1, logprobs=1, echo=True
                )
                if "choices" in result and result["choices"]:
                    logprobs = result["choices"][0].get("logprobs", {})
                    if logprobs and logprobs.get("token_logprobs"):
                        token_lps = [
                            lp
                            for lp in logprobs["token_logprobs"]
                            if lp is not None
                        ]
                        if token_lps:
                            total_loss += -sum(token_lps)
                            total_tokens += len(token_lps)
    
            if total_tokens > 0:
                avg_nll = total_loss / total_tokens
                ppl = math.exp(avg_nll)
                return {
                    "success": True,
                    "perplexity": round(ppl, 2),
                    "method": "llama-cpp-python",
                    "tokens_evaluated": total_tokens,
                    "dataset": "built-in test passages",
                }
    
        except ImportError:
            pass
        except Exception as e:
            return {
                "success": False,
                "error": f"GGUF evaluation error: {e}",
            }
    
        return {
            "success": False,
            "error": (
                "Cannot evaluate GGUF model. "
                "Install llama-cpp-python: pip install llama-cpp-python"
            ),
            "install_cmd": "pip install llama-cpp-python",
        }
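Note that the llama-perplexity path above expects a local wikitext-2 test file at the hard-coded relative path. A hedged sketch of one way to produce that file, assuming the Hugging Face datasets package is installed:

    import os

    from datasets import load_dataset

    # Write the wikitext-2 raw test split to the path the llama-perplexity
    # command above expects (wikitext-2-raw/wiki.test.raw).
    os.makedirs("wikitext-2-raw", exist_ok=True)
    ds = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    with open("wikitext-2-raw/wiki.test.raw", "w", encoding="utf-8") as f:
        f.write("\n".join(ds["text"]))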
  • GPTQ/AWQ model evaluation using transformers library with perplexity calculation.
    def evaluate_transformers(model_path: str, fmt: str) -> dict[str, Any]:
        """Evaluate GPTQ/AWQ model perplexity using transformers."""
        try:
            import torch
            from transformers import AutoModelForCausalLM, AutoTokenizer
        except ImportError:
            return {
                "success": False,
                "error": (
                    "Evaluation requires transformers + torch. "
                    "Install: pip install transformers torch"
                ),
                "install_cmd": "pip install transformers torch",
            }
    
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_path)
            model = AutoModelForCausalLM.from_pretrained(
                model_path, device_map="auto", torch_dtype=torch.float16
            )
            model.eval()
    
            test_texts = [
                "The quick brown fox jumps over the lazy dog. This is a standard test sentence.",
                "In machine learning, quantization reduces the number of bits that represent a number.",
                "The Transformer architecture has become the dominant paradigm in natural language processing.",
                "Large language models have demonstrated remarkable capabilities in text generation.",
                "Neural networks consist of layers of interconnected nodes that process information.",
            ]
    
            total_loss = 0.0
            total_tokens = 0
    
            with torch.no_grad():
                for text in test_texts:
                    inputs = tokenizer(text, return_tensors="pt").to(model.device)
                    outputs = model(**inputs, labels=inputs["input_ids"])
                    loss = outputs.loss.item()
                    num_tokens = inputs["input_ids"].shape[1]
                    total_loss += loss * num_tokens
                    total_tokens += num_tokens
    
            avg_nll = total_loss / total_tokens
            ppl = math.exp(avg_nll)
    
            return {
                "success": True,
                "perplexity": round(ppl, 2),
                "method": f"transformers ({fmt.upper()})",
                "tokens_evaluated": total_tokens,
                "dataset": "built-in test passages",
            }
    
        except Exception as e:
            return {
                "success": False,
                "error": f"Evaluation error: {e}",
            }
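One detail worth calling out in the loop above: each text's mean loss is weighted by its token count before averaging, so longer passages contribute proportionally more to the final perplexity. A small sketch of that token-weighted average with hypothetical numbers:

    import math

    # Hypothetical (mean_loss, num_tokens) pairs, one per test text
    per_text = [(2.1, 25), (1.8, 22), (2.4, 19)]

    total_loss = sum(loss * n for loss, n in per_text)   # token-weighted sum of losses
    total_tokens = sum(n for _, n in per_text)

    avg_nll = total_loss / total_tokens   # ~2.09 with these numbers
    perplexity = math.exp(avg_nll)        # ~8.06

    print(round(perplexity, 2))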
  • MCP tool registration via @mcp.tool() decorator for the 'evaluate' tool, with docstring, parameter schema, and call to evaluate_model().
    @mcp.tool()
    def evaluate(
        model_path: str,
        format: str = "gguf",
        bits: int = 4,
    ) -> dict[str, Any]:
        """Run perplexity evaluation on a quantized model.
    
        Measures model quality after quantization using perplexity scoring.
        Lower perplexity = better quality. Includes a quality assessment
        (EXCELLENT/GOOD/FAIR/DEGRADED/POOR).
    
        Args:
            model_path: Path to the quantized model file (GGUF) or directory
                        (GPTQ/AWQ).
            format: Format of the quantized model. One of 'gguf', 'gptq', 'awq'.
            bits: Bit width used during quantization (for quality context).
    
        Returns:
            Perplexity score, quality assessment, and evaluation metadata.
        """
        if not os.path.exists(model_path):
            return {
                "success": False,
                "error": f"Model path does not exist: {model_path}",
            }
    
        return evaluate_model(model_path, format.lower(), bits)
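Once the server is running, an MCP client calls the tool by name with the arguments documented above. A hedged sketch using the official mcp Python SDK's stdio client (the server launch command and model path are hypothetical and depend on how the server is installed):

    import asyncio

    from mcp import ClientSession, StdioServerParameters
    from mcp.client.stdio import stdio_client

    async def main() -> None:
        # Hypothetical launch command for the TurboQuant MCP server
        server = StdioServerParameters(command="python", args=["-m", "mcp_turboquant"])
        async with stdio_client(server) as (read, write):
            async with ClientSession(read, write) as session:
                await session.initialize()
                result = await session.call_tool(
                    "evaluate",
                    arguments={
                        "model_path": "models/llama-3-8b-q4_k_m.gguf",  # hypothetical path
                        "format": "gguf",
                        "bits": 4,
                    },
                )
                print(result)

    asyncio.run(main())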
Behavior 3/5

Does the description disclose side effects, auth requirements, rate limits, or destructive behavior?

No annotations provided. Description discloses compute behavior (perplexity scoring) and output format, but does not mention side effects, permissions, or potential resource consumption. Adequate but not fully transparent for a compute-heavy tool.

Agents need to know what a tool does to the world before calling it. Descriptions should go beyond structured annotations to explain consequences.

Conciseness 4/5

Is the description appropriately sized, front-loaded, and free of redundancy?

Well-structured with summary, Args, and Returns sections. Two sentences of purpose followed by bullet-like arguments. Slightly verbose but efficient overall.

Shorter descriptions cost fewer tokens and are easier for agents to parse. Every sentence should earn its place.

Completeness 4/5

Given the tool's complexity, does the description cover enough for an agent to succeed on first attempt?

Covers key usage context (post-quantization), parameter details, and output description. With output schema present, the Returns section is sufficient. Missing edge cases or constraints (e.g., file size limits) but complete for typical use.

Complex tools with many parameters or behaviors need more documentation. Simple tools need less. This dimension scales expectations accordingly.

Parameters 5/5

Does the description clarify parameter syntax, constraints, interactions, or defaults beyond what the schema provides?

Schema coverage is 0%, but description provides detailed explanations for all three parameters (model_path, format, bits) including examples and context. Consistently adds meaning beyond schema metadata.

Input schemas describe structure but not intent. Descriptions should explain non-obvious parameter relationships and valid value ranges.

Purpose 5/5

Does the description clearly state what the tool does and how it differs from similar tools?

Description clearly states 'Run perplexity evaluation on a quantized model', specifying the action and resource. It distinguishes from sibling tools (e.g., quantize, check, info) by focusing on evaluation after quantization.

Agents choose between tools based on descriptions. A clear purpose with a specific verb and resource helps agents select the right tool.

Usage Guidelines 4/5

Does the description explain when to use this tool, when not to, or what alternatives exist?

Explicitly indicates usage context: 'Measures model quality after quantization'. No explicit when-not-to-use or alternative tools, but the purpose is clear enough for an agent to infer appropriate use.

Agents often have multiple tools that could apply. Explicit usage guidance like "use X instead of Y when Z" prevents misuse.
