generate_multimodal_response

Generate AI responses from a text prompt combined with images or files, routing the request to one of the supported providers (OpenAI or DashScope) for multimodal analysis.

Instructions

Generate response from multimodal model.

        Args:
            model: Model name to use
            prompt: Text prompt
            image_urls: Optional list of image URLs
            file_paths: Optional list of file paths
            system_prompt: Optional system prompt
            max_tokens: Maximum tokens to generate
            temperature: Generation temperature
            provider: Optional provider name (openai, dashscope)

        Returns:
            Generated response text
        

Input Schema

| Name          | Required | Description                                | Default |
|---------------|----------|--------------------------------------------|---------|
| model         | Yes      | Model name to use                          |         |
| prompt        | Yes      | Text prompt                                |         |
| image_urls    | No       | Optional list of image URLs                |         |
| file_paths    | No       | Optional list of file paths                |         |
| system_prompt | No       | Optional system prompt                     |         |
| max_tokens    | No       | Maximum tokens to generate                 | 1000    |
| temperature   | No       | Generation temperature                     | 0.7     |
| provider      | No       | Optional provider name (openai, dashscope) |         |
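
For illustration, a client call to this tool might pass arguments like the following (the model name and image URL are placeholders, not values taken from this server):

    arguments = {
        "model": "gpt-4o",
        "prompt": "Describe what is shown in this image.",
        "image_urls": ["https://example.com/photo.jpg"],
        "max_tokens": 500,
        "temperature": 0.2,
        "provider": "openai",
    }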

Implementation Reference

  • The main handler function for the 'generate_multimodal_response' tool. It is decorated with @self.server.tool(), processes input parameters, constructs a MultimodalRequest, delegates to the appropriate provider for generation, and formats the response.
    @self.server.tool()
    def generate_multimodal_response(
        model: str,
        prompt: str,
        image_urls: Optional[List[str]] = None,
        file_paths: Optional[List[str]] = None,
        system_prompt: Optional[str] = None,
        max_tokens: Optional[int] = 1000,
        temperature: Optional[float] = 0.7,
        provider: Optional[str] = None
    ) -> str:
        """Generate response from multimodal model.
    
        Args:
            model: Model name to use
            prompt: Text prompt
            image_urls: Optional list of image URLs
            file_paths: Optional list of file paths
            system_prompt: Optional system prompt
            max_tokens: Maximum tokens to generate
            temperature: Generation temperature
            provider: Optional provider name (openai, dashscope)
    
        Returns:
            Generated response text
        """
        try:
            # Auto-detect provider if not specified
            if not provider:
                if model.startswith("gpt"):
                    provider = "openai"
                elif model.startswith("qwen"):
                    provider = "dashscope"
                else:
                    provider = list(self.providers.keys())[0] if self.providers else None
    
            if not provider or provider not in self.providers:
                return f"Error: Provider '{provider}' not available"
    
            # Build multimodal request
            text_contents = [TextContent(text=prompt)]
            image_contents = []
            file_contents = []
    
            # Add image content
            if image_urls:
                for url in image_urls:
                    image_contents.append(ImageContent(
                        url=url,
                        mime_type="image/jpeg"  # Default, will be updated if needed
                    ))
    
            # Add file content
            if file_paths:
                for file_path in file_paths:
                    path = Path(file_path)
                    if path.exists():
                        import mimetypes
                        mime_type, _ = mimetypes.guess_type(file_path)
    
                        if mime_type and mime_type.startswith("image/"):
                            image_contents.append(ImageContent(
                                image_path=file_path,
                                mime_type=mime_type
                            ))
                        elif mime_type and mime_type.startswith("text/"):
                            with open(path, 'r', encoding='utf-8') as f:
                                content = f.read()
                            file_contents.append(FileContent(
                                filename=path.name,
                                text=content,
                                mime_type=mime_type
                            ))
    
            request = MultimodalRequest(
                model=model,
                text_contents=text_contents,
                image_contents=image_contents,
                file_contents=file_contents,
                system_prompt=system_prompt,
                max_tokens=max_tokens,
                temperature=temperature
            )
    
            # Generate response. The provider call is async, but this
            # handler is synchronous, so bridge the two without blocking
            # (or closing) an already-running event loop.
            try:
                asyncio.get_running_loop()
            except RuntimeError:
                # No running loop: run the coroutine to completion directly.
                response = asyncio.run(
                    self.providers[provider].generate_response(request)
                )
            else:
                # Already inside a running loop: blocking it would deadlock,
                # so run the coroutine on a worker thread with its own loop.
                import concurrent.futures
                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                    response = pool.submit(
                        asyncio.run,
                        self.providers[provider].generate_response(request),
                    ).result()

            if response.error:
                return f"Error: {response.error}"

            result = response.text
            if response.usage:
                result += f"\n\n[Token usage: {response.usage}]"

            return result
    
        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return f"Error: {str(e)}"
  • The _setup_tools method where the generate_multimodal_response tool is registered using the @self.server.tool() decorator.
    def _setup_tools(self):
        """Setup MCP tools."""
  • Pydantic model used internally for structuring the multimodal request passed to providers. Supports input validation and typing.
    class MultimodalRequest(BaseModel):
        """Multimodal request model."""
        model: str = Field(..., description="Model name")
        text_contents: List[TextContent] = Field(default_factory=list, description="Text content list")
        image_contents: List[ImageContent] = Field(default_factory=list, description="Image content list")
        file_contents: List[FileContent] = Field(default_factory=list, description="File content list")
        system_prompt: Optional[str] = Field(None, description="System prompt")
        max_tokens: Optional[int] = Field(1000, description="Maximum tokens to generate")
        temperature: float = Field(0.7, description="Generation temperature")
        top_p: Optional[float] = Field(None, description="Top-p sampling")
        top_k: Optional[int] = Field(None, description="Top-k sampling")
        stream: bool = Field(False, description="Whether to stream response")
        extra_params: Dict[str, Any] = Field(default_factory=dict, description="Extra model parameters")
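
    For example, the handler above would construct a request like this for a prompt plus a single image URL (values illustrative):

    request = MultimodalRequest(
        model="qwen-vl-plus",  # illustrative model name
        text_contents=[TextContent(text="What is in this picture?")],
        image_contents=[
            ImageContent(url="https://example.com/cat.jpg", mime_type="image/jpeg")
        ],
        max_tokens=500,
        temperature=0.2,
    )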
  • Helper method in OpenAIProvider that performs the actual API call to generate multimodal responses, used by the main handler.
    async def generate_response(
        self, request: MultimodalRequest
    ) -> MultimodalResponse:
        """Generate response from OpenAI multimodal model.
    
        Args:
            request: Multimodal request containing text, images, and files
    
        Returns:
            Multimodal response
        """
        try:
            messages = self._build_messages(request)
    
            response = self.client.chat.completions.create(
                model=request.model,
                messages=messages,
                max_tokens=request.max_tokens,
                temperature=request.temperature,
                stream=False,
            )
    
            return self._parse_response(response)
    
        except openai.APIError as e:
            raise Exception(f"OpenAI API error: {e}") from e
        except Exception as e:
            raise Exception(f"Error generating response: {e}") from e

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/StanleyChanH/vllm-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.