vllm_chat
Send chat messages to a vLLM server for multi-turn conversations with configurable model parameters and token limits.
Instructions
Send a chat message to the vLLM server. Supports multi-turn conversations.
Input Schema
Parameters (JSON Schema)
| Name | Required | Description | Default |
|---|---|---|---|
| messages | Yes | List of messages in the conversation | |
| model | No | Model to use (optional, uses default if not specified) | |
| temperature | No | Sampling temperature (0-2) | 0.7 |
| max_tokens | No | Maximum tokens to generate | 1024 |
Implementation Reference
- src/vllm_mcp_server/server.py:54-96 (registration) — Tool registration for 'vllm_chat' defining its schema with messages, model, temperature, and max_tokens parameters
Tool( name="vllm_chat", description="Send a chat message to the vLLM server. Supports multi-turn conversations.", inputSchema={ "type": "object", "properties": { "messages": { "type": "array", "description": "List of messages in the conversation", "items": { "type": "object", "properties": { "role": { "type": "string", "enum": ["system", "user", "assistant"], "description": "The role of the message sender", }, "content": { "type": "string", "description": "The content of the message", }, }, "required": ["role", "content"], }, }, "model": { "type": "string", "description": "Model to use (optional, uses default if not specified)", }, "temperature": { "type": "number", "description": "Sampling temperature (0-2)", "default": 0.7, }, "max_tokens": { "type": "integer", "description": "Maximum tokens to generate", "default": 1024, }, }, "required": ["messages"], }, ), - src/vllm_mcp_server/tools/chat.py:10-74 (handler)The handle_chat function that executes the vllm_chat tool logic: validates messages, extracts parameters, calls VLLMClient.chat_completion, and formats the response with usage info
async def handle_chat(arguments: dict[str, Any]) -> list[TextContent]: """ Handle chat completion request. Args: arguments: Dictionary containing: - messages: List of message objects with 'role' and 'content' - model: Optional model name to use - temperature: Optional temperature (0-2) - max_tokens: Optional maximum tokens to generate - stream: Whether to stream the response (default: False) Returns: List of TextContent with the assistant's response. """ messages = arguments.get("messages", []) if not messages: return [TextContent(type="text", text="Error: No messages provided")] # Validate message format for msg in messages: if not isinstance(msg, dict) or "role" not in msg or "content" not in msg: return [ TextContent( type="text", text="Error: Each message must have 'role' and 'content' fields", ) ] model = arguments.get("model") temperature = arguments.get("temperature") max_tokens = arguments.get("max_tokens") try: async with VLLMClient() as client: response = await client.chat_completion( messages=messages, model=model, temperature=temperature, max_tokens=max_tokens, stream=False, ) # Extract the assistant's message choices = response.get("choices", []) if not choices: return [TextContent(type="text", text="Error: No response from model")] assistant_message = choices[0].get("message", {}).get("content", "") # Include usage info usage = response.get("usage", {}) usage_info = "" if usage: usage_info = ( f"\n\n---\n" f"Tokens: {usage.get('prompt_tokens', 0)} prompt + " f"{usage.get('completion_tokens', 0)} completion = " f"{usage.get('total_tokens', 0)} total" ) return [TextContent(type="text", text=assistant_message + usage_info)] except VLLMClientError as e: return [TextContent(type="text", text=f"Error: {str(e)}")] - src/vllm_mcp_server/server.py:57-95 (schema)Input schema definition for vllm_chat tool specifying required 'messages' array and optional parameters (model, temperature, max_tokens)
inputSchema={ "type": "object", "properties": { "messages": { "type": "array", "description": "List of messages in the conversation", "items": { "type": "object", "properties": { "role": { "type": "string", "enum": ["system", "user", "assistant"], "description": "The role of the message sender", }, "content": { "type": "string", "description": "The content of the message", }, }, "required": ["role", "content"], }, }, "model": { "type": "string", "description": "Model to use (optional, uses default if not specified)", }, "temperature": { "type": "number", "description": "Sampling temperature (0-2)", "default": 0.7, }, "max_tokens": { "type": "integer", "description": "Maximum tokens to generate", "default": 1024, }, }, "required": ["messages"], }, - VLLMClient.chat_completion method that makes the actual OpenAI-compatible API call to the vLLM server's /chat/completions endpoint
async def chat_completion( self, messages: list[dict[str, str]], model: Optional[str] = None, temperature: Optional[float] = None, max_tokens: Optional[int] = None, stream: bool = False, **kwargs: Any, ) -> dict[str, Any] | AsyncIterator[dict[str, Any]]: """Send a chat completion request.""" model = model or self.settings.model if not model: # Try to get the first available model models = await self.list_models() if models: model = models[0].get("id") else: raise VLLMAPIError("No model specified and no models available", 400) payload = { "model": model, "messages": messages, "temperature": temperature or self.settings.default_temperature, "max_tokens": max_tokens or self.settings.default_max_tokens, "stream": stream, **kwargs, } session = await self._get_session() try: if stream: return self._stream_chat_completion(session, payload) else: return await self._send_chat_completion(session, payload) except aiohttp.ClientConnectorError as e: raise VLLMConnectionError(f"Cannot connect to vLLM server: {e}") from e async def _send_chat_completion( self, session: aiohttp.ClientSession, payload: dict ) -> dict[str, Any]: """Send non-streaming chat completion request.""" async with session.post( f"{self.base_url}/chat/completions", headers=self.headers, json=payload, ) as response: if response.status != 200: body = await response.text() raise VLLMAPIError( f"Chat completion failed: {response.status}", response.status, body, ) return await response.json() - src/vllm_mcp_server/server.py:342-343 (registration)Tool call handler registration that routes 'vllm_chat' tool calls to the handle_chat function
if name == "vllm_chat": return await handle_chat(arguments)