recommend
Analyzes model size and hardware specifications to suggest optimal quantization format and bit width for efficient deployment.
Instructions
Recommend best quantization format and bit width for a model.
Analyzes the model size and your hardware (GPU VRAM, Apple Silicon, system RAM) to suggest the optimal format (GGUF/GPTQ/AWQ) and bit width (2-8). Ranked recommendations with use-case explanations.
Args: model: HuggingFace model ID (e.g. 'meta-llama/Llama-3.1-8B-Instruct') or local path to a model directory.
Returns: Ranked recommendations with format, bits, reasoning, and use cases.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| model | Yes | HuggingFace model ID (e.g. 'meta-llama/Llama-3.1-8B-Instruct') or local path to a model directory. | — |
Output Schema
| Name | Required | Description | Default |
|---|---|---|---|
| No arguments | | | |
Implementation Reference
- mcp_turboquant/server.py:157-180 (handler) — Handler for the 'recommend' tool, which takes a model ID and returns recommended quantization formats.
def recommend(model: str) -> dict[str, Any]: """Recommend best quantization format and bit width for a model. Analyzes the model size and your hardware (GPU VRAM, Apple Silicon, system RAM) to suggest the optimal format (GGUF/GPTQ/AWQ) and bit width (2-8). Ranked recommendations with use-case explanations. Args: model: HuggingFace model ID (e.g. 'meta-llama/Llama-3.1-8B-Instruct') or local path to a model directory. Returns: Ranked recommendations with format, bits, reasoning, and use cases. """ model_info = get_model_info(model) if not model_info.get("found"): return { "error": f"Model not found: {model_info.get('error', 'unknown')}", "model": model, } deps = check_dependencies() return recommend_format(model_info, deps) - mcp_turboquant/server.py:156-157 (registration)Registration of the 'recommend' tool using the @mcp.tool() decorator.
@mcp.tool() def recommend(model: str) -> dict[str, Any]: - mcp_turboquant/model_info.py:218-320 (helper)Helper function that performs the actual logic for generating recommendations.
def recommend_format( model_info: dict[str, Any], deps: dict[str, Any] ) -> list[dict[str, Any]]: """Recommend the best quantization format based on hardware and model.""" model_size_gb = model_info.get("size_bytes", 0) / 1e9 params = model_info.get("params_estimate", 0) params_b = params / 1e9 if params else 0 has_cuda = deps.get("cuda", False) gpu_name = deps.get("gpu_name", "") gpu_mem = deps.get("gpu_mem_gb", 0) has_mps = deps.get("mps", False) system_ram = deps.get("system_ram_gb", 0) or get_system_ram_gb() hardware = {} if has_cuda: hardware["accelerator"] = f"CUDA GPU: {gpu_name} ({gpu_mem}GB VRAM)" elif has_mps: hardware["accelerator"] = f"Apple Silicon (MPS) — {system_ram}GB unified memory" else: hardware["accelerator"] = "None (CPU only)" hardware["ram_gb"] = system_ram recommendations: list[dict[str, Any]] = [] # Estimate quantized sizes size_4bit = model_size_gb / 4 if model_size_gb else params_b * 0.5 size_8bit = model_size_gb / 2 if model_size_gb else params_b * 1.0 source = model_info.get("source", "MODEL") def _make_rec(rank, label, fmt, bits, reason, use_case): return { "rank": rank, "label": label, "format": fmt, "bits": bits, "reason": reason, "use_case": use_case, "command": f'quantize(model="{source}", format="{fmt.lower()}", bits={bits})', } if has_cuda and gpu_mem > 0: if size_4bit * 1.2 <= gpu_mem: recommendations.append(_make_rec( 1, "BEST", "AWQ", 4, f"Best GPU throughput. 4-bit model (~{size_4bit:.1f}GB) fits in {gpu_mem}GB VRAM.", "Production GPU serving with vLLM or TGI", )) recommendations.append(_make_rec( 2, "ALSO GOOD", "GPTQ", 4, "Alternative GPU format. Wider tool support than AWQ.", "GPU serving when AWQ isn't available", )) recommendations.append(_make_rec( 3, "ALTERNATIVE", "GGUF", 4, "Universal format. 
Works with Ollama, LM Studio, llama.cpp.", "Local use, sharing, or CPU fallback", )) elif size_4bit * 1.2 > gpu_mem and size_4bit <= system_ram: recommendations.append(_make_rec( 1, "BEST", "GGUF", 4, f"Model too large for {gpu_mem}GB VRAM. GGUF supports CPU+GPU split.", "CPU+GPU hybrid inference via llama.cpp", )) if params_b > 13: recommendations.append(_make_rec( 2, "ALSO GOOD", "GGUF", 2, f"Aggressive compression to fit in {gpu_mem}GB VRAM. Quality trade-off.", "When VRAM is tight and you need GPU acceleration", )) else: recommendations.append(_make_rec( 1, "BEST", "GGUF", 2, "Model requires aggressive compression for your hardware.", "Maximum compression for large models", )) elif has_mps: recommendations.append(_make_rec( 1, "BEST", "GGUF", 4, "Best format for Apple Silicon. llama.cpp has Metal acceleration.", "Ollama or LM Studio on Mac", )) if size_8bit <= system_ram * 0.7: recommendations.append(_make_rec( 2, "ALSO GOOD", "GGUF", 8, f"Higher quality, still fits in {system_ram}GB unified memory.", "Maximum quality on Mac", )) else: recommendations.append(_make_rec( 1, "BEST", "GGUF", 4, "Only format that runs well on CPU. Use with Ollama or llama.cpp.", "CPU inference via Ollama or llama.cpp", )) if params_b <= 3 and size_8bit <= system_ram * 0.5: recommendations.append(_make_rec( 2, "ALSO GOOD", "GGUF", 8, f"Small model ({model_info.get('params_human', '')}). Higher quality fits in RAM.", "Better quality for small models on CPU", ))