vllm_complete
Generate a text completion for a given prompt using a vLLM-served model. Suited to code completion and general text generation, with configurable sampling parameters such as temperature, a maximum token count, and optional stop sequences.
Instructions
Generate text completion using vLLM. Good for code completion and text generation.
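Because vllm_complete is exposed as an MCP tool, a client invokes it by name with a JSON arguments object. Below is a minimal sketch using the official `mcp` Python SDK; the `python -m vllm_mcp_server` launch command and the example prompt are assumptions for illustration, not taken from the repository.

```python
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

# Assumed entry point, inferred from the src/vllm_mcp_server package name.
server_params = StdioServerParameters(command="python", args=["-m", "vllm_mcp_server"])

async def main() -> None:
    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            # Call the tool by name; only "prompt" is required.
            result = await session.call_tool(
                "vllm_complete",
                {"prompt": "def fibonacci(n):", "max_tokens": 128},
            )
            print(result.content[0].text)

asyncio.run(main())
```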
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| prompt | Yes | The prompt to complete | |
| model | No | Model to use (optional) | |
| temperature | No | Sampling temperature (0-2) | 0.7 |
| max_tokens | No | Maximum tokens to generate | 1024 |
| stop | No | Stop sequences | |
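As a concrete illustration of this schema, here is a sketch of an arguments object for a code-completion call; the prompt and values are invented for illustration. Omitted optional fields fall back to the schema defaults shown above.

```python
# Illustrative arguments for a vllm_complete call; only "prompt" is required.
arguments = {
    "prompt": "def fibonacci(n):",  # text to complete (hypothetical example)
    "temperature": 0.2,             # lower temperature for more deterministic code
    "max_tokens": 256,              # overrides the schema default of 1024
    "stop": ["\n\n"],               # stop generating at the first blank line
}
```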
Implementation Reference
- src/vllm_mcp_server/server.py:97-129 (schema): Tool schema definition for vllm_complete, specifying the input parameters (prompt, model, temperature, max_tokens, stop) and their validation rules.
```python
Tool(
    name="vllm_complete",
    description="Generate text completion using vLLM. Good for code completion and text generation.",
    inputSchema={
        "type": "object",
        "properties": {
            "prompt": {
                "type": "string",
                "description": "The prompt to complete",
            },
            "model": {
                "type": "string",
                "description": "Model to use (optional)",
            },
            "temperature": {
                "type": "number",
                "description": "Sampling temperature (0-2)",
                "default": 0.7,
            },
            "max_tokens": {
                "type": "integer",
                "description": "Maximum tokens to generate",
                "default": 1024,
            },
            "stop": {
                "type": "array",
                "items": {"type": "string"},
                "description": "Stop sequences",
            },
        },
        "required": ["prompt"],
    },
),
```

- src/vllm_mcp_server/server.py:336-371 (registration): Tool registration handler that routes vllm_complete calls to the handle_complete function (lines 344-345).
```python
@app.call_tool()
async def call_tool(name: str, arguments: dict[str, Any]) -> list[TextContent]:
    """Handle tool calls."""
    logger.info(f"Tool called: {name} with arguments: {arguments}")
    try:
        if name == "vllm_chat":
            return await handle_chat(arguments)
        elif name == "vllm_complete":
            return await handle_complete(arguments)
        elif name == "list_models":
            return await list_models()
        elif name == "get_model_info":
            return await get_model_info(arguments)
        elif name == "vllm_status":
            status_text = await get_server_status_text()
            return [TextContent(type="text", text=status_text)]
        elif name == "start_vllm":
            return await start_vllm(arguments)
        elif name == "stop_vllm":
            return await stop_vllm(arguments)
        elif name == "restart_vllm":
            return await restart_vllm(arguments)
        elif name == "list_vllm_containers":
            return await list_vllm_containers(arguments)
        elif name == "get_vllm_logs":
            return await get_vllm_logs(arguments)
        elif name == "get_platform_status":
            return await get_platform_status(arguments)
        elif name == "run_benchmark":
            return await run_benchmark(arguments)
        else:
            return [TextContent(type="text", text=f"Unknown tool: {name}")]
    except Exception as e:
        logger.exception(f"Error in tool {name}")
        return [TextContent(type="text", text=f"Error: {str(e)}")]
```

- src/vllm_mcp_server/tools/chat.py:77-136 (handler): Main handler function for vllm_complete that validates arguments, calls the VLLMClient.text_completion method, and formats the response with usage information.
```python
async def handle_complete(arguments: dict[str, Any]) -> list[TextContent]:
    """
    Handle text completion request.

    Args:
        arguments: Dictionary containing:
            - prompt: The text prompt to complete
            - model: Optional model name to use
            - temperature: Optional temperature (0-2)
            - max_tokens: Optional maximum tokens to generate
            - stop: Optional stop sequences

    Returns:
        List of TextContent with the generated completion.
    """
    prompt = arguments.get("prompt", "")
    if not prompt:
        return [TextContent(type="text", text="Error: No prompt provided")]

    model = arguments.get("model")
    temperature = arguments.get("temperature")
    max_tokens = arguments.get("max_tokens")
    stop = arguments.get("stop")

    extra_kwargs: dict[str, Any] = {}
    if stop:
        extra_kwargs["stop"] = stop

    try:
        async with VLLMClient() as client:
            response = await client.text_completion(
                prompt=prompt,
                model=model,
                temperature=temperature,
                max_tokens=max_tokens,
                **extra_kwargs,
            )

            # Extract the completion
            choices = response.get("choices", [])
            if not choices:
                return [TextContent(type="text", text="Error: No response from model")]

            completion_text = choices[0].get("text", "")

            # Include usage info
            usage = response.get("usage", {})
            usage_info = ""
            if usage:
                usage_info = (
                    f"\n\n---\n"
                    f"Tokens: {usage.get('prompt_tokens', 0)} prompt + "
                    f"{usage.get('completion_tokens', 0)} completion = "
                    f"{usage.get('total_tokens', 0)} total"
                )

            return [TextContent(type="text", text=completion_text + usage_info)]

    except VLLMClientError as e:
        return [TextContent(type="text", text=f"Error: {str(e)}")]
```

- VLLMClient.text_completion: the method that sends the actual HTTP POST request to the vLLM server's /completions endpoint.
```python
async def text_completion(
    self,
    prompt: str,
    model: Optional[str] = None,
    temperature: Optional[float] = None,
    max_tokens: Optional[int] = None,
    stream: bool = False,
    **kwargs: Any,
) -> dict[str, Any]:
    """Send a text completion request."""
    model = model or self.settings.model
    if not model:
        models = await self.list_models()
        if models:
            model = models[0].get("id")
        else:
            raise VLLMAPIError("No model specified and no models available", 400)

    payload = {
        "model": model,
        "prompt": prompt,
        "temperature": temperature or self.settings.default_temperature,
        "max_tokens": max_tokens or self.settings.default_max_tokens,
        "stream": stream,
        **kwargs,
    }

    session = await self._get_session()
    try:
        async with session.post(
            f"{self.base_url}/completions",
            headers=self.headers,
            json=payload,
        ) as response:
            if response.status != 200:
                body = await response.text()
                raise VLLMAPIError(
                    f"Text completion failed: {response.status}",
                    response.status,
                    body,
                )
            return await response.json()
    except aiohttp.ClientConnectorError as e:
        raise VLLMConnectionError(f"Cannot connect to vLLM server: {e}") from e
```
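For reference, the request and response bodies follow the OpenAI-compatible completions format that vLLM serves. The sketch below shows the shape text_completion posts and the shape handle_complete reads back (choices[0]["text"] plus the usage counters); the model id and generated text are invented placeholders.

```python
# What text_completion POSTs to {base_url}/completions
# (model id is a hypothetical placeholder).
payload = {
    "model": "my-org/my-model",
    "prompt": "def fibonacci(n):",
    "temperature": 0.7,
    "max_tokens": 1024,
    "stream": False,
}

# The OpenAI-compatible response shape handle_complete consumes:
# choices[0]["text"] for the completion, plus the usage token counts.
response = {
    "choices": [{"text": "\n    if n < 2:\n        return n\n    ..."}],
    "usage": {"prompt_tokens": 6, "completion_tokens": 42, "total_tokens": 48},
}
```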