Skip to main content
Glama

vllm_complete

Generate text completions for code and content using vLLM models. Provide prompts to produce coherent outputs with configurable parameters like temperature and token limits.

Instructions

Generate text completion using vLLM. Good for code completion and text generation.

Input Schema

Table / JSON Schema

| Name        | Required | Description                 | Default |
| ----------- | -------- | --------------------------- | ------- |
| prompt      | Yes      | The prompt to complete      | —       |
| model       | No       | Model to use (optional)     | —       |
| temperature | No       | Sampling temperature (0-2)  | 0.7     |
| max_tokens  | No       | Maximum tokens to generate  | 1024    |
| stop        | No       | Stop sequences              | —       |

Implementation Reference

  • Tool schema definition for vllm_complete, specifying the input parameters (prompt, model, temperature, max_tokens, stop) and their validation rules
    # MCP tool declaration for "vllm_complete": plain text completion.
    # inputSchema is standard JSON Schema; only "prompt" is required —
    # all other fields are optional, with defaults for temperature/max_tokens.
    Tool(
        name="vllm_complete",
        description="Generate text completion using vLLM. Good for code completion and text generation.",
        inputSchema={
            "type": "object",
            "properties": {
                "prompt": {
                    "type": "string",
                    "description": "The prompt to complete",
                },
                "model": {
                    # NOTE(review): presumably falls back to a configured/served
                    # model when omitted — confirm against the client code.
                    "type": "string",
                    "description": "Model to use (optional)",
                },
                "temperature": {
                    "type": "number",
                    "description": "Sampling temperature (0-2)",
                    "default": 0.7,
                },
                "max_tokens": {
                    "type": "integer",
                    "description": "Maximum tokens to generate",
                    "default": 1024,
                },
                "stop": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Stop sequences",
                },
            },
            "required": ["prompt"],
        },
    ),
  • Tool registration handler that routes vllm_complete calls to the handle_complete function (line 344-345)
    @app.call_tool()
    async def call_tool(name: str, arguments: dict[str, Any]) -> list[TextContent]:
        """Dispatch an incoming tool call to its handler by tool name.

        Unknown tool names produce an "Unknown tool" message; any exception
        raised by a handler is logged and returned as an error TextContent
        rather than propagated.
        """
        logger.info(f"Tool called: {name} with arguments: {arguments}")

        # Handlers that take the raw arguments dict and return the response.
        dispatch = {
            "vllm_chat": handle_chat,
            "vllm_complete": handle_complete,
            "get_model_info": get_model_info,
            "start_vllm": start_vllm,
            "stop_vllm": stop_vllm,
            "restart_vllm": restart_vllm,
            "list_vllm_containers": list_vllm_containers,
            "get_vllm_logs": get_vllm_logs,
            "get_platform_status": get_platform_status,
            "run_benchmark": run_benchmark,
        }

        try:
            # Two tools don't fit the (arguments) -> response shape above.
            if name == "list_models":
                return await list_models()
            if name == "vllm_status":
                status_text = await get_server_status_text()
                return [TextContent(type="text", text=status_text)]

            handler = dispatch.get(name)
            if handler is None:
                return [TextContent(type="text", text=f"Unknown tool: {name}")]
            return await handler(arguments)
        except Exception as e:
            logger.exception(f"Error in tool {name}")
            return [TextContent(type="text", text=f"Error: {str(e)}")]
  • Main handler function for vllm_complete that validates arguments, calls the VLLMClient.text_completion method, and formats the response with usage information
    async def handle_complete(arguments: dict[str, Any]) -> list[TextContent]:
        """
        Run a text-completion request against the vLLM backend.

        Args:
            arguments: Dictionary containing:
                - prompt (required): the text prompt to complete
                - model: optional model name
                - temperature: optional sampling temperature (0-2)
                - max_tokens: optional cap on generated tokens
                - stop: optional list of stop sequences

        Returns:
            A single-element list of TextContent holding the completion,
            followed by a token-usage footer when the server reports usage.
        """
        prompt = arguments.get("prompt", "")
        if not prompt:
            return [TextContent(type="text", text="Error: No prompt provided")]

        # Only forward "stop" when the caller actually supplied sequences.
        optional_kwargs: dict[str, Any] = {}
        stop_sequences = arguments.get("stop")
        if stop_sequences:
            optional_kwargs["stop"] = stop_sequences

        try:
            async with VLLMClient() as client:
                response = await client.text_completion(
                    prompt=prompt,
                    model=arguments.get("model"),
                    temperature=arguments.get("temperature"),
                    max_tokens=arguments.get("max_tokens"),
                    **optional_kwargs,
                )

                choices = response.get("choices", [])
                if not choices:
                    return [TextContent(type="text", text="Error: No response from model")]

                completion_text = choices[0].get("text", "")

                # Append a usage footer only when the server reported usage.
                usage = response.get("usage", {})
                usage_info = (
                    (
                        f"\n\n---\n"
                        f"Tokens: {usage.get('prompt_tokens', 0)} prompt + "
                        f"{usage.get('completion_tokens', 0)} completion = "
                        f"{usage.get('total_tokens', 0)} total"
                    )
                    if usage
                    else ""
                )

                return [TextContent(type="text", text=completion_text + usage_info)]

        except VLLMClientError as e:
            return [TextContent(type="text", text=f"Error: {str(e)}")]
  • VLLMClient.text_completion method that sends the actual HTTP POST request to the vLLM server's /completions endpoint
    async def text_completion(
        self,
        prompt: str,
        model: Optional[str] = None,
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None,
        stream: bool = False,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """Send a text completion request to the vLLM /completions endpoint.

        Args:
            prompt: The text to complete.
            model: Model id; falls back to the configured model, then to the
                first model the server advertises.
            temperature: Sampling temperature; the configured default is used
                only when this is None.
            max_tokens: Cap on generated tokens; the configured default is
                used only when this is None.
            stream: Whether to request a streaming response.
            **kwargs: Extra fields merged into the request payload
                (e.g. "stop").

        Returns:
            The parsed JSON response body as a dict.

        Raises:
            VLLMAPIError: On a non-200 response, or when no model is
                specified and none is available.
            VLLMConnectionError: When the server cannot be reached.
        """
        model = model or self.settings.model
        if not model:
            models = await self.list_models()
            if models:
                model = models[0].get("id")
            else:
                raise VLLMAPIError("No model specified and no models available", 400)

        # Use explicit None checks: the previous `x or default` form silently
        # replaced valid falsy values (temperature=0.0 for greedy decoding,
        # max_tokens=0) with the configured defaults.
        payload = {
            "model": model,
            "prompt": prompt,
            "temperature": (
                temperature
                if temperature is not None
                else self.settings.default_temperature
            ),
            "max_tokens": (
                max_tokens
                if max_tokens is not None
                else self.settings.default_max_tokens
            ),
            "stream": stream,
            **kwargs,
        }

        session = await self._get_session()
        try:
            async with session.post(
                f"{self.base_url}/completions",
                headers=self.headers,
                json=payload,
            ) as response:
                if response.status != 200:
                    body = await response.text()
                    raise VLLMAPIError(
                        f"Text completion failed: {response.status}",
                        response.status,
                        body,
                    )
                return await response.json()
        except aiohttp.ClientConnectorError as e:
            raise VLLMConnectionError(f"Cannot connect to vLLM server: {e}") from e

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/micytao/vllm-mcp-server'

If you have feedback or need assistance with the MCP directory API, please join our Discord server