"""
Modal deployment configuration for WarpGBM MCP Service
Deploys the FastAPI app to Modal with GPU support.
Usage:
modal deploy modal_app.py
"""
import modal
# Create Modal app
app = modal.App("warpgbm-mcp")
# Create Modal Dict for artifact caching (persists across container instances)
artifact_cache_dict = modal.Dict.from_name("artifact-cache", create_if_missing=True)
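
# (Illustrative) A Modal Dict acts as a cross-container key-value store, e.g.
#   artifact_cache_dict["model_id"] = payload
#   payload = artifact_cache_dict.get("model_id")
# serve() below wires this Dict in as the artifact cache's backend.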
# Create Modal Volume for feedback storage (persistent across deployments)
feedback_volume = modal.Volume.from_name("warpgbm-feedback", create_if_missing=True)
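
# NOTE (assumption about write semantics): files written under the /data mount
# in serve() persist through this Volume; if writes ever fail to persist, call
# feedback_volume.commit() inside the container after writing.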
# Define container image with dependencies
# Build on an NVIDIA CUDA devel image, which includes the CUDA toolkit WarpGBM needs for JIT compilation
image = (
modal.Image.from_registry("nvidia/cuda:12.1.0-devel-ubuntu22.04", add_python="3.11")
# Install build tools (gcc, g++, clang, make) needed for compiling CUDA extensions
.apt_install("build-essential", "clang")
.pip_install(
"fastapi>=0.109.0",
"uvicorn[standard]>=0.27.0",
"pydantic>=2.0.0",
"numpy>=1.24.0",
"joblib>=1.3.0",
"PyJWT>=2.8.0",
"python-multipart>=0.0.6",
"onnx>=1.15.0",
"onnxruntime>=1.16.0",
"skl2onnx>=1.16.0",
"slowapi>=0.1.9",
"httpx>=0.24.0",
# Data processing
"pandas>=2.0.0",
"pyarrow>=14.0.0",
# Model backends
"lightgbm>=4.0.0",
"scikit-learn>=1.3.0",
)
# Install torch first (WarpGBM needs it to detect CUDA)
.pip_install("torch", "wheel")
    # Set CUDA architectures for WarpGBM compilation (A10G = compute_86, plus other common ones)
    .env({"TORCH_CUDA_ARCH_LIST": "7.5;8.0;8.6;8.9"})  # T4 (7.5), A100 (8.0), A10G (8.6), Ada/L4 (8.9)
# Now install WarpGBM with --no-build-isolation so it can compile CUDA kernels
.pip_install("warpgbm", extra_options="--no-build-isolation")
.add_local_dir("app", remote_path="/root/app")
.add_local_dir(".well-known", remote_path="/root/.well-known")
.add_local_dir("docs", remote_path="/root/docs")
.add_local_dir("assets", remote_path="/root/assets")
)
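
# Optional sanity check (illustrative sketch, not required for deployment):
# confirms the image built above can import WarpGBM and see a GPU. It only
# runs when invoked explicitly:
#   modal run modal_app.py::check_gpu_image
@app.function(image=image, gpu="A10G", timeout=120)
def check_gpu_image() -> str:
    import torch
    import warpgbm  # noqa: F401  (import check only)

    return f"CUDA available: {torch.cuda.is_available()}, device: {torch.cuda.get_device_name(0)}"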
# GPU functions for WarpGBM (training and prediction)
@app.function(
image=image,
gpu="A10G", # Specific GPU type for WarpGBM
cpu=4.0,
memory=16384, # 16GB RAM
timeout=600, # 10 minutes max
scaledown_window=60, # Shut down after 1 minute idle (save $$$)
max_containers=1, # Max 1 GPU at a time (safest for cost control)
)
def warpgbm_gpu_predict(model_artifact: str, X):
"""
GPU-accelerated WarpGBM prediction.
WarpGBM is GPU-only, so predictions must also run on GPU.
Args:
model_artifact: Base64-encoded gzipped joblib model
X: Feature matrix (list of lists)
Returns:
list: Predictions
"""
import sys
sys.path.insert(0, "/root")
import numpy as np
import torch
import joblib
import base64
import gzip
import io
# Deserialize model with GPU support (DON'T force CPU!)
compressed_bytes = base64.b64decode(model_artifact)
model_bytes = gzip.decompress(compressed_bytes)
buf = io.BytesIO(model_bytes)
# Load with GPU mapping (WarpGBM needs CUDA tensors)
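    # (While joblib.load runs below, every torch.load call is redirected so
    # deserialized tensors land on the GPU instead of the device recorded at
    # save time; the original torch.load is restored afterwards.)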
original_load = torch.load
def gpu_load(*args, **kwargs):
if 'map_location' not in kwargs:
kwargs['map_location'] = 'cuda' # Force GPU for WarpGBM
return original_load(*args, **kwargs)
torch.load = gpu_load
try:
model = joblib.load(buf)
finally:
torch.load = original_load
# Convert input to numpy
X = np.array(X, dtype=np.float32)
# Predict on GPU
preds = model.predict(X)
return preds.tolist() if hasattr(preds, 'tolist') else list(preds)
@app.function(
image=image,
gpu="A10G", # Specific GPU type for WarpGBM
cpu=4.0,
memory=16384, # 16GB RAM
timeout=600, # 10 minutes max
scaledown_window=60, # Shut down after 1 minute idle (save $$$)
max_containers=1, # Max 1 GPU at a time (safest for cost control)
)
def warpgbm_gpu_predict_proba(model_artifact: str, X):
"""
GPU-accelerated WarpGBM probability prediction.
WarpGBM is GPU-only, so predictions must also run on GPU.
Args:
model_artifact: Base64-encoded gzipped joblib model
X: Feature matrix (list of lists)
Returns:
list: Probability predictions (n_samples, n_classes)
"""
import sys
sys.path.insert(0, "/root")
import numpy as np
import torch
import joblib
import base64
import gzip
import io
# Deserialize model with GPU support (DON'T force CPU!)
compressed_bytes = base64.b64decode(model_artifact)
model_bytes = gzip.decompress(compressed_bytes)
buf = io.BytesIO(model_bytes)
# Load with GPU mapping (WarpGBM needs CUDA tensors)
original_load = torch.load
def gpu_load(*args, **kwargs):
if 'map_location' not in kwargs:
kwargs['map_location'] = 'cuda' # Force GPU for WarpGBM
return original_load(*args, **kwargs)
torch.load = gpu_load
try:
model = joblib.load(buf)
finally:
torch.load = original_load
# Convert input to numpy
X = np.array(X, dtype=np.float32)
# Predict probabilities on GPU
probs = model.predict_proba(X)
return probs.tolist() if hasattr(probs, 'tolist') else list(probs)
@app.function(
image=image,
gpu="A10G", # Specific GPU type for WarpGBM
cpu=4.0,
memory=16384, # 16GB RAM
timeout=600, # 10 minutes max
scaledown_window=60, # Shut down after 1 minute idle (save $$$)
max_containers=1, # Max 1 GPU at a time (safest for cost control)
)
def train_warpgbm_gpu(X, y, **params):
"""
GPU-accelerated WarpGBM training.
    This function is kept separate from the main service to:
    1. Control GPU costs (it only runs when explicitly called)
    2. Scale down quickly (60s vs. 5 min)
    3. Limit concurrency (max 1 GPU container, the safest option)
    4. Shut down automatically after 60s idle
Cost: ~$0.0006/second = ~$2.16/hour (A10G)
Max cost: $2.16/hour × 1 container = $2.16/hour (even if bombarded with requests)
Args:
X: Feature matrix (list of lists)
y: Target labels (list)
**params: WarpGBM hyperparameters
Returns:
str: Base64-encoded gzipped joblib model artifact
"""
import sys
sys.path.insert(0, "/root")
import numpy as np
from app.utils import serialize_model_joblib
from app.model_registry import registry
# Convert lists to numpy arrays
X = np.array(X, dtype=np.float32)
y = np.array(y, dtype=np.float32 if params.get("objective") == "regression" else np.int32)
try:
        # Import WarpGBM to confirm it is installed; the registry below
        # instantiates the actual (GPU-accelerated) model
        from warpgbm.core import WarpGBM  # noqa: F401
# Create config and model using the registry
ConfigClass = registry.get_config_class("warpgbm")
config = ConfigClass(**params)
model = registry.create_model(config)
# Train on GPU
model.fit(X, y)
# Keep model on GPU - WarpGBM is GPU-only!
# Serialize with GPU tensors (will be loaded back on GPU for predictions)
return serialize_model_joblib(model)
except ImportError as e:
raise NotImplementedError(
"WarpGBM library is not installed in this Modal container. "
"Please add WarpGBM to the Modal image in modal_app.py. "
f"Error: {e}"
)
@app.function(
image=image,
# CPU-ONLY for main service (healthchecks, LightGBM, inference)
cpu=2.0, # 2 vCPUs
memory=2048, # 2GB RAM
timeout=900, # 15 minutes max per request
scaledown_window=300, # Keep warm for 5 minutes
max_containers=10, # Max 10 concurrent requests
volumes={"/data": feedback_volume}, # Mount volume for feedback storage
)
@modal.asgi_app(custom_domains=["warpgbm.ai"])
def serve():
"""
Serve the FastAPI app (CPU-only for cost efficiency).
- Healthchecks, MCP endpoints: CPU
- LightGBM training: CPU (fast enough)
- WarpGBM training: Delegates to GPU function
- Inference: CPU (models are already trained)
Cost: ~$0.0001/second = ~$0.36/hour (CPU)
"""
import sys
sys.path.insert(0, "/root")
    # Re-initialize the module-level cache instance in place so existing
    # references pick up the Modal Dict backend (Modal Dicts are available to
    # every function in the app)
    from app.utils import artifact_cache

    artifact_cache.__init__(default_ttl_seconds=300, backend=artifact_cache_dict)
# Inject GPU functions into main module
import app.main
app.main._gpu_training_function = train_warpgbm_gpu
app.main._gpu_predict_function = warpgbm_gpu_predict
app.main._gpu_predict_proba_function = warpgbm_gpu_predict_proba
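    # (Assumed contract: app/main.py calls these handles via .remote(...) when
    # a request targets WarpGBM, so the web container itself never needs a GPU.)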
from app.main import app as fastapi_app
return fastapi_app
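
# Optional end-to-end smoke test (illustrative sketch): trains a tiny WarpGBM
# model via the GPU function and round-trips the artifact through prediction.
# The toy dataset is made up, and the warpgbm config class is assumed to
# accept all-default hyperparameters. Run with:
#   modal run modal_app.py::smoke_test
@app.local_entrypoint()
def smoke_test():
    X = [[0.0, 1.0], [1.0, 0.0], [0.1, 0.9], [0.9, 0.1]]
    y = [0, 1, 0, 1]
    artifact = train_warpgbm_gpu.remote(X, y)
    preds = warpgbm_gpu_predict.remote(artifact, X)
    print("predictions:", preds)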