generate_multimodal_response
Generates an AI response from a text prompt combined with optional images or files, routing the request to one of the supported providers (OpenAI or DashScope) for multimodal content analysis.
Instructions
Generate response from multimodal model.
Args:
- model: Model name to use
- prompt: Text prompt
- image_urls: Optional list of image URLs
- file_paths: Optional list of local file paths
- system_prompt: Optional system prompt
- max_tokens: Maximum tokens to generate (default: 1000)
- temperature: Generation temperature (default: 0.7)
- provider: Optional provider name (openai, dashscope); auto-detected from the model name if omitted

Returns:
- Generated response text
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
| model | Yes | Model name to use | |
| prompt | Yes | Text prompt | |
| image_urls | No | Optional list of image URLs | |
| file_paths | No | Optional list of local file paths | |
| system_prompt | No | Optional system prompt | |
| max_tokens | No | Maximum tokens to generate | 1000 |
| temperature | No | Generation temperature | 0.7 |
| provider | No | Provider name (openai, dashscope); auto-detected from the model name if omitted | |
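
As a usage illustration, the sketch below calls this tool through the Python MCP client SDK over stdio. The server launch command (`python -m vllm_mcp.server`), the model name, and the image URL are assumptions; substitute whatever matches your deployment.

```python
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client

# Assumed launch command for the server; adjust to your installation.
server_params = StdioServerParameters(command="python", args=["-m", "vllm_mcp.server"])

async def main():
    async with stdio_client(server_params) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            result = await session.call_tool(
                "generate_multimodal_response",
                arguments={
                    "model": "qwen-vl-plus",  # example model; auto-detects the dashscope provider
                    "prompt": "Describe this image in one sentence.",
                    "image_urls": ["https://example.com/photo.jpg"],
                    "max_tokens": 300,
                },
            )
            # The tool returns plain text, wrapped in MCP content items.
            print(result.content)

asyncio.run(main())
```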
Implementation Reference
- src/vllm_mcp/server.py:131-252 (handler): The main handler function for the `generate_multimodal_response` tool. It is decorated with `@self.server.tool()`, processes input parameters, constructs a `MultimodalRequest`, delegates to the appropriate provider for generation, and formats the response.

```python
@self.server.tool()
def generate_multimodal_response(
    model: str,
    prompt: str,
    image_urls: Optional[List[str]] = None,
    file_paths: Optional[List[str]] = None,
    system_prompt: Optional[str] = None,
    max_tokens: Optional[int] = 1000,
    temperature: Optional[float] = 0.7,
    provider: Optional[str] = None
) -> str:
    """Generate response from multimodal model.

    Args:
        model: Model name to use
        prompt: Text prompt
        image_urls: Optional list of image URLs
        file_paths: Optional list of file paths
        system_prompt: Optional system prompt
        max_tokens: Maximum tokens to generate
        temperature: Generation temperature
        provider: Optional provider name (openai, dashscope)

    Returns:
        Generated response text
    """
    try:
        # Auto-detect provider if not specified
        if not provider:
            if model.startswith("gpt"):
                provider = "openai"
            elif model.startswith("qwen"):
                provider = "dashscope"
            else:
                provider = list(self.providers.keys())[0] if self.providers else None

        if not provider or provider not in self.providers:
            return f"Error: Provider '{provider}' not available"

        # Build multimodal request
        text_contents = [TextContent(text=prompt)]
        image_contents = []
        file_contents = []

        # Add image content
        if image_urls:
            for url in image_urls:
                image_contents.append(ImageContent(
                    url=url,
                    mime_type="image/jpeg"  # Default, will be updated if needed
                ))

        # Add file content
        if file_paths:
            for file_path in file_paths:
                path = Path(file_path)
                if path.exists():
                    import mimetypes
                    mime_type, _ = mimetypes.guess_type(file_path)
                    if mime_type and mime_type.startswith("image/"):
                        image_contents.append(ImageContent(
                            image_path=file_path,
                            mime_type=mime_type
                        ))
                    elif mime_type and mime_type.startswith("text/"):
                        with open(path, 'r', encoding='utf-8') as f:
                            content = f.read()
                        file_contents.append(FileContent(
                            filename=path.name,
                            text=content,
                            mime_type=mime_type
                        ))

        request = MultimodalRequest(
            model=model,
            text_contents=text_contents,
            image_contents=image_contents,
            file_contents=file_contents,
            system_prompt=system_prompt,
            max_tokens=max_tokens,
            temperature=temperature
        )

        # Generate response
        try:
            # Check if we're already in an event loop
            try:
                loop = asyncio.get_running_loop()
                # We're already in a loop, create a task
                task = asyncio.create_task(
                    self.providers[provider].generate_response(request)
                )
                # Wait for the task to complete
                while not task.done():
                    asyncio.sleep(0.01)
                response = task.result()
            except RuntimeError:
                # No running loop, create a new one
                loop = asyncio.new_event_loop()
                asyncio.set_event_loop(loop)
                response = loop.run_until_complete(
                    self.providers[provider].generate_response(request)
                )
                loop.close()

            if response.error:
                return f"Error: {response.error}"

            result = response.text
            if response.usage:
                result += f"\n\n[Token usage: {response.usage}]"

            return result
        finally:
            loop.close()
    except Exception as e:
        logger.error(f"Error generating response: {e}")
        return f"Error: {str(e)}"
```
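As the file-handling branch above shows, local `file_paths` are routed by guessed MIME type: image files become `ImageContent`, text files are read inline into `FileContent`, and anything else (for example a PDF) is silently skipped. A standalone illustration of that routing logic, with hypothetical file names:

```python
import mimetypes

for file_path in ["chart.png", "notes.txt", "report.pdf"]:  # hypothetical inputs
    mime_type, _ = mimetypes.guess_type(file_path)
    if mime_type and mime_type.startswith("image/"):
        route = "image_contents"
    elif mime_type and mime_type.startswith("text/"):
        route = "file_contents"
    else:
        route = "ignored"  # e.g. application/pdf falls through in the handler above
    print(f"{file_path}: {mime_type} -> {route}")
```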
- src/vllm_mcp/server.py:128-129 (registration): The `_setup_tools` method where the `generate_multimodal_response` tool is registered using the `@self.server.tool()` decorator.

```python
def _setup_tools(self):
    """Setup MCP tools."""
```
- src/vllm_mcp/models.py:50-63 (schema): Pydantic model used internally to structure the multimodal request passed to providers; provides input validation and typing.

```python
class MultimodalRequest(BaseModel):
    """Multimodal request model."""
    model: str = Field(..., description="Model name")
    text_contents: List[TextContent] = Field(default_factory=list, description="Text content list")
    image_contents: List[ImageContent] = Field(default_factory=list, description="Image content list")
    file_contents: List[FileContent] = Field(default_factory=list, description="File content list")
    system_prompt: Optional[str] = Field(None, description="System prompt")
    max_tokens: Optional[int] = Field(1000, description="Maximum tokens to generate")
    temperature: float = Field(0.7, description="Generation temperature")
    top_p: Optional[float] = Field(None, description="Top-p sampling")
    top_k: Optional[int] = Field(None, description="Top-k sampling")
    stream: bool = Field(False, description="Whether to stream response")
    extra_params: Dict[str, Any] = Field(default_factory=dict, description="Extra model parameters")
```
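A minimal sketch of building a request directly against this model. The field values are illustrative, and the import path follows the file reference above; the `TextContent` and `ImageContent` constructors mirror their use in the handler.

```python
from vllm_mcp.models import MultimodalRequest, TextContent, ImageContent

request = MultimodalRequest(
    model="gpt-4o",  # example model name
    text_contents=[TextContent(text="What is shown in this picture?")],
    image_contents=[ImageContent(url="https://example.com/cat.png", mime_type="image/png")],
    max_tokens=500,
    temperature=0.2,
)

# Pydantic validates types and fills defaults (stream=False, extra_params={}, ...).
print(request.model_dump())  # use .dict() on pydantic v1
```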
- Helper method in `OpenAIProvider` that performs the actual API call to generate multimodal responses, used by the main handler.

```python
async def generate_response(
    self,
    request: MultimodalRequest
) -> MultimodalResponse:
    """Generate response from OpenAI multimodal model.

    Args:
        request: Multimodal request containing text, images, and files

    Returns:
        Multimodal response
    """
    try:
        messages = self._build_messages(request)

        response = self.client.chat.completions.create(
            model=request.model,
            messages=messages,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            stream=False,
        )

        return self._parse_response(response)
    except openai.APIError as e:
        raise Exception(f"OpenAI API error: {e}")
    except Exception as e:
        raise Exception(f"Error generating response: {e}")
```
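For direct use outside the MCP server, this coroutine can be awaited with a `MultimodalRequest`. A sketch, assuming an already constructed `OpenAIProvider` instance (its constructor arguments are not shown in this reference); the response fields match those consumed by the handler above.

```python
async def run(provider, request):
    # provider: an OpenAIProvider instance; request: a MultimodalRequest as built above
    response = await provider.generate_response(request)
    if response.error:
        print(f"Error: {response.error}")
    else:
        print(response.text)
        if response.usage:
            print(f"[Token usage: {response.usage}]")
```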