# quantize
Convert HuggingFace models to GGUF, GPTQ, or AWQ format for efficient deployment. Specify a bit width and a deployment target to shrink the model's disk and memory footprint while preserving most of its output quality.
## Instructions
Quantize a HuggingFace model to GGUF, GPTQ, or AWQ format.
This is a heavy operation that downloads and compresses the model. Requires appropriate backend dependencies to be installed.
Args:
- `model`: HuggingFace model ID (e.g. `meta-llama/Llama-3.1-8B-Instruct`) or local path to a model directory.
- `format`: Output format, one of `gguf`, `gptq`, or `awq`. Default: `gguf`.
- `bits`: Quantization bit width: 2, 3, 4, 5, or 8. Default: 4.
- `output_dir`: Directory to write output files. Default: a temp directory.
- `target`: Deployment target. `ollama`, `llamacpp`, and `lmstudio` force GGUF; `vllm` forces AWQ.
Returns: Quantization result with file paths, sizes, and compression ratios.
## Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| model | Yes | HuggingFace model ID (e.g. `meta-llama/Llama-3.1-8B-Instruct`) or local path to a model directory | |
| format | No | Output format: `gguf`, `gptq`, or `awq` | gguf |
| bits | No | Quantization bit width: 2, 3, 4, 5, or 8 | 4 |
| output_dir | No | Directory to write output files | temp directory |
| target | No | Deployment target: `ollama`, `vllm`, `llamacpp`, or `lmstudio`; forces the matching format | |
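As a concrete illustration (the values below are examples, not requirements), a request that targets Ollama could pass arguments like this:

```python
# Example arguments for the quantize tool; target="ollama" forces GGUF
# output regardless of the format argument.
arguments = {
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "format": "gguf",
    "bits": 4,
    "output_dir": "/tmp/quantized/llama-3.1-8b",  # optional; defaults to a temp directory
    "target": "ollama",
}
```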
## Output Schema
No structured output schema is declared for this tool. The handler returns a result object that always includes `model`, `architecture`, `parameters`, `original_size`, `target_bits`, `format`, and `theoretical_compression`. On success it adds `success: true`, `output_file`, `output_size`, `output_size_bytes`, and, when available, `actual_compression`, `quant_type`, and an `ollama` block with import/run commands. On failure it sets `success: false`, `error`, and, when applicable, `install_cmd`.
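Since the response is untyped, the sketch below shows what a successful run might return. The keys match the handler in the Implementation Reference; the values are illustrative only.

```python
# Illustrative success response; keys come from the handler below,
# values are made-up examples for a 4-bit GGUF run.
example_response = {
    "model": "meta-llama/Llama-3.1-8B-Instruct",
    "architecture": "LlamaForCausalLM",  # model_info["arch"]
    "parameters": "8.0B",                # human-readable parameter count
    "original_size": "16.1 GB",
    "target_bits": 4,
    "format": "gguf",
    "theoretical_compression": "4.0x",   # 16-bit weights reduced to 4-bit
    "success": True,
    "output_file": "/tmp/turboquant/meta-llama-Llama-3-1-8B-Instruct-gguf-4bit/model-Q4_K_M.gguf",
    "output_size": "4.9 GB",
    "output_size_bytes": 4_920_000_000,
    "actual_compression": "3.3x",
    "quant_type": "Q4_K_M",
}
```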
## Implementation Reference
- `mcp_turboquant/server.py:183-290` (handler): The MCP tool registration for `quantize` and its handler implementation in `server.py`.
```python
@mcp.tool()
def quantize(
    model: str,
    format: Literal["gguf", "gptq", "awq"] = "gguf",
    bits: Literal[2, 3, 4, 5, 8] = 4,
    output_dir: str | None = None,
    target: Literal["ollama", "vllm", "llamacpp", "lmstudio"] | None = None,
) -> dict[str, Any]:
    """Quantize a HuggingFace model to GGUF, GPTQ, or AWQ format.

    This is a heavy operation that downloads and compresses the model.
    Requires appropriate backend dependencies to be installed.

    Args:
        model: HuggingFace model ID (e.g. 'meta-llama/Llama-3.1-8B-Instruct')
            or local path to a model directory.
        format: Output format — gguf, gptq, or awq. Default: gguf.
        bits: Quantization bit width — 2, 3, 4, 5, or 8. Default: 4.
        output_dir: Directory to write output files. Default: temp directory.
        target: Deployment target. ollama/llamacpp/lmstudio force GGUF,
            vllm forces AWQ.

    Returns:
        Quantization result with file paths, sizes, and compression ratios.
    """
    # Resolve target overrides
    fmt = format.lower()
    if target:
        target = target.lower()
        if target == "ollama":
            fmt = "gguf"
        elif target == "vllm":
            fmt = "awq"
        elif target in ("llamacpp", "lmstudio"):
            fmt = "gguf"

    if fmt not in SUPPORTED_FORMATS:
        return {
            "error": f"Unsupported format '{fmt}'. Use one of: {SUPPORTED_FORMATS}",
        }
    if bits not in SUPPORTED_BITS:
        return {
            "error": f"Unsupported bit width {bits}. Use one of: {SUPPORTED_BITS}",
        }

    # Get model info for the report
    model_info = get_model_info(model)
    if not model_info.get("found"):
        return {
            "error": f"Model not found: {model_info.get('error', 'unknown')}",
            "model": model,
        }

    # Set up output directory
    if not output_dir:
        model_slug = model.replace("/", "-").replace(".", "-")
        output_dir = os.path.join(
            tempfile.gettempdir(), "turboquant", f"{model_slug}-{fmt}-{bits}bit"
        )
    os.makedirs(output_dir, exist_ok=True)

    # Run quantization
    result = quantize_model(model, fmt, bits, output_dir)

    # Build response
    response = {
        "model": model,
        "architecture": model_info.get("arch", "unknown"),
        "parameters": model_info.get("params_human", "unknown"),
        "original_size": model_info.get("size_human", "unknown"),
        "target_bits": bits,
        "format": fmt,
        "theoretical_compression": f"{estimate_compression(16, bits):.1f}x",
    }

    if result["success"]:
        response["success"] = True
        response["output_file"] = result["file"]
        response["output_size"] = result.get("size_human", "unknown")
        response["output_size_bytes"] = result.get("size", 0)
        original_bytes = model_info.get("size_bytes", 0)
        if original_bytes and result.get("size"):
            actual = original_bytes / result["size"]
            response["actual_compression"] = f"{actual:.1f}x"
        if result.get("quant_type"):
            response["quant_type"] = result["quant_type"]

        # Generate Ollama Modelfile if target is ollama
        if target == "ollama" and fmt == "gguf":
            modelfile_path = generate_ollama_modelfile(
                result["file"], model_info, output_dir
            )
            model_name = model.split("/")[-1].lower().replace(".", "-")
            quant_type = result.get("quant_type", "Q4_K_M")
            response["ollama"] = {
                "modelfile": modelfile_path,
                "import_command": f"cd {output_dir} && ollama create {model_name}-{quant_type.lower()} -f Modelfile",
                "run_command": f"ollama run {model_name}-{quant_type.lower()}",
            }
    else:
        response["success"] = False
        response["error"] = result.get("error", "Unknown error")
        if result.get("install_cmd"):
            response["install_cmd"] = result["install_cmd"]

    return response
```
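The handler reports `theoretical_compression` via `estimate_compression(16, bits)`, a helper that is not shown on this page. A minimal sketch of what such a helper could look like, assuming it simply compares bit widths (the real implementation may differ):

```python
def estimate_compression(source_bits: int, target_bits: int) -> float:
    """Rough compression ratio from reducing weight precision.

    A 16-bit model quantized to 4 bits gives 16 / 4 = 4.0x. Actual files
    deviate from this because of metadata, embeddings kept at higher
    precision, and per-group scales/zero-points.
    """
    return source_bits / target_bits
```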
- `mcp_turboquant/quantize.py:278-309` (handler): The dispatch function `quantize_model`, which coordinates the quantization backends.

```python
def quantize_model(
    model_id: str, fmt: str, bits: int, output_dir: str
) -> dict[str, Any]:
    """Dispatch quantization to the correct backend.

    Args:
        model_id: HuggingFace model ID or local path.
        fmt: One of 'gguf', 'gptq', 'awq'.
        bits: Quantization bit width (2, 3, 4, 5, or 8).
        output_dir: Directory to write output files.

    Returns:
        Result dict with success status and file info.
    """
    if fmt not in SUPPORTED_FORMATS:
        return {
            "success": False,
            "error": f"Unsupported format '{fmt}'. Use one of: {SUPPORTED_FORMATS}",
        }
    if bits not in SUPPORTED_BITS:
        return {
            "success": False,
            "error": f"Unsupported bit width {bits}. Use one of: {SUPPORTED_BITS}",
        }

    dispatch = {
        "gguf": quantize_gguf,
        "gptq": quantize_gptq,
        "awq": quantize_awq,
    }
    return dispatch[fmt](model_id, bits, output_dir)
```
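Outside of the MCP server, the dispatch layer can be exercised directly. A short usage sketch, assuming the `mcp_turboquant` package is importable and the relevant backend dependencies are installed:

```python
from mcp_turboquant.quantize import quantize_model

# Quantize a local checkpoint to 4-bit GGUF; the result mirrors the
# dicts returned by the backend functions below.
result = quantize_model("./models/my-llama", "gguf", 4, "/tmp/out")
if result["success"]:
    print(result["file"], result["size_human"])
else:
    print("failed:", result["error"])
```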
- `mcp_turboquant/quantize.py:39-134` (handler): Implementation of the GGUF quantization backend.

```python
def quantize_gguf(model_id: str, bits: int, output_dir: str) -> dict[str, Any]:
    """Quantize model to GGUF format using llama.cpp.

    Tries multiple methods in order:
    1. llama-cpp-python convert + llama-quantize binary
    2. convert_hf_to_gguf.py from llama.cpp source
    """
    quant_type = GGUF_QUANT_TYPES.get(bits, "Q4_K_M")
    output_file = os.path.join(output_dir, f"model-{quant_type}.gguf")
    os.makedirs(output_dir, exist_ok=True)

    # Method 1: Try llama-cpp-python convert + llama-quantize
    try:
        fp16_file = os.path.join(output_dir, "model-fp16.gguf")
        cmd_convert = [
            sys.executable,
            "-m",
            "llama_cpp.convert",
            "--outfile",
            fp16_file,
            "--outtype",
            "f16",
            model_id,
        ]
        result = subprocess.run(
            cmd_convert, capture_output=True, text=True, timeout=3600
        )
        if result.returncode == 0 and os.path.exists(fp16_file):
            cmd_quant = ["llama-quantize", fp16_file, output_file, quant_type]
            result = subprocess.run(
                cmd_quant, capture_output=True, text=True, timeout=3600
            )
            if result.returncode == 0 and os.path.exists(output_file):
                os.remove(fp16_file)
                return {
                    "success": True,
                    "file": output_file,
                    "size": os.path.getsize(output_file),
                    "size_human": format_size(os.path.getsize(output_file)),
                    "format": "gguf",
                    "quant_type": quant_type,
                    "bits": bits,
                }
    except (FileNotFoundError, subprocess.TimeoutExpired):
        pass

    # Method 2: Try convert_hf_to_gguf.py from llama.cpp
    try:
        convert_script = shutil.which("convert_hf_to_gguf.py")
        if not convert_script:
            for candidate in [
                os.path.expanduser("~/llama.cpp/convert_hf_to_gguf.py"),
                "/opt/llama.cpp/convert_hf_to_gguf.py",
            ]:
                if os.path.exists(candidate):
                    convert_script = candidate
                    break
        if convert_script:
            cmd = [
                sys.executable,
                convert_script,
                model_id,
                "--outfile",
                output_file,
                "--outtype",
                quant_type.lower(),
            ]
            result = subprocess.run(
                cmd, capture_output=True, text=True, timeout=3600
            )
            if result.returncode == 0 and os.path.exists(output_file):
                return {
                    "success": True,
                    "file": output_file,
                    "size": os.path.getsize(output_file),
                    "size_human": format_size(os.path.getsize(output_file)),
                    "format": "gguf",
                    "quant_type": quant_type,
                    "bits": bits,
                }
    except Exception:
        pass

    return {
        "success": False,
        "format": "gguf",
        "bits": bits,
        "error": (
            "GGUF quantization requires llama.cpp tools. "
            "Install: pip install llama-cpp-python, or build llama.cpp from source."
        ),
        "install_cmd": "pip install llama-cpp-python",
    }
```
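The backend maps bit widths to llama.cpp quantization names through `GGUF_QUANT_TYPES`, which is defined elsewhere in `quantize.py`. A plausible sketch of that mapping, using standard llama.cpp type names; only the `Q4_K_M` fallback is confirmed by the code above, the rest are assumptions:

```python
# Hypothetical contents of GGUF_QUANT_TYPES; the actual module may
# choose different per-bit-width types.
GGUF_QUANT_TYPES = {
    2: "Q2_K",
    3: "Q3_K_M",
    4: "Q4_K_M",
    5: "Q5_K_M",
    8: "Q8_0",
}
```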
- `mcp_turboquant/quantize.py:137-213` (handler): Implementation of the GPTQ quantization backend.

```python
def quantize_gptq(model_id: str, bits: int, output_dir: str) -> dict[str, Any]:
    """Quantize model using GPTQ via auto-gptq.

    Requires: torch, transformers, auto-gptq, datasets
    Uses c4 calibration data (128 samples, 2048 max length).
    """
    os.makedirs(output_dir, exist_ok=True)

    try:
        from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
        from transformers import AutoTokenizer
    except ImportError:
        return {
            "success": False,
            "format": "gptq",
            "bits": bits,
            "error": "GPTQ requires: pip install auto-gptq transformers datasets torch",
            "install_cmd": "pip install auto-gptq transformers datasets torch",
        }

    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        quantize_config = BaseQuantizeConfig(
            bits=bits,
            group_size=128,
            damp_percent=0.1,
            desc_act=False,
        )
        model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config)

        # Prepare calibration data from c4
        from datasets import load_dataset

        dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)
        calibration_data = []
        for i, example in enumerate(dataset):
            if i >= 128:
                break
            tokenized = tokenizer(
                example["text"],
                return_tensors="pt",
                truncation=True,
                max_length=2048,
            )
            calibration_data.append(tokenized.input_ids)

        model.quantize(calibration_data)

        output_path = os.path.join(output_dir, f"model-gptq-{bits}bit")
        model.save_quantized(output_path)
        tokenizer.save_pretrained(output_path)

        total_size = sum(
            os.path.getsize(os.path.join(output_path, f))
            for f in os.listdir(output_path)
            if f.endswith((".safetensors", ".bin"))
        )
        return {
            "success": True,
            "file": output_path,
            "size": total_size,
            "size_human": format_size(total_size),
            "format": "gptq",
            "bits": bits,
            "group_size": 128,
        }
    except Exception as e:
        return {
            "success": False,
            "format": "gptq",
            "bits": bits,
            "error": str(e),
        }
```
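The directory written by `quantize_gptq` is a regular auto-gptq checkpoint and can typically be reloaded for inference with the same library. A short usage sketch, not part of the tool itself; it assumes auto-gptq, transformers, and a CUDA device are available:

```python
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer

# Path produced by quantize_gptq, e.g. "<output_dir>/model-gptq-4bit"
path = "/tmp/turboquant/my-model-gptq-4bit"
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoGPTQForCausalLM.from_quantized(path, device="cuda:0")
```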
- `mcp_turboquant/quantize.py:216-275` (handler): Implementation of the AWQ quantization backend.

```python
def quantize_awq(model_id: str, bits: int, output_dir: str) -> dict[str, Any]:
    """Quantize model using AWQ via autoawq.

    Requires: torch, transformers, autoawq
    Uses GEMM kernel with group_size=128.
    """
    os.makedirs(output_dir, exist_ok=True)

    try:
        from awq import AutoAWQForCausalLM
        from transformers import AutoTokenizer
    except ImportError:
        return {
            "success": False,
            "format": "awq",
            "bits": bits,
            "error": "AWQ requires: pip install autoawq transformers torch",
            "install_cmd": "pip install autoawq transformers torch",
        }

    try:
        model = AutoAWQForCausalLM.from_pretrained(model_id)
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        quant_config = {
            "zero_point": True,
            "q_group_size": 128,
            "w_bit": bits,
            "version": "GEMM",
        }
        model.quantize(tokenizer, quant_config=quant_config)

        output_path = os.path.join(output_dir, f"model-awq-{bits}bit")
        model.save_quantized(output_path)
        tokenizer.save_pretrained(output_path)

        total_size = sum(
            os.path.getsize(os.path.join(output_path, f))
            for f in os.listdir(output_path)
            if f.endswith((".safetensors", ".bin"))
        )
        return {
            "success": True,
            "file": output_path,
            "size": total_size,
            "size_human": format_size(total_size),
            "format": "awq",
            "bits": bits,
            "group_size": 128,
        }
    except Exception as e:
        return {
            "success": False,
            "format": "awq",
            "bits": bits,
            "error": str(e),
        }
```
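Because the `vllm` target forces AWQ output, the saved directory is intended to be served with vLLM. A minimal serving sketch, assuming vLLM is installed in the deployment environment (this step is outside the tool):

```python
from vllm import LLM, SamplingParams

# Path produced by quantize_awq, e.g. "<output_dir>/model-awq-4bit"
llm = LLM(model="/tmp/turboquant/my-model-awq-4bit", quantization="awq")
outputs = llm.generate(["Hello, world"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```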