/**
* Pollinations Vision Service
*
* Functions and schemas for vision/image analysis using the Pollinations API
*/
import { createMCPResponse, createTextContent } from '../utils/coreUtils.js';
import { z } from 'zod';
// Constants
// OpenAI-compatible chat-completions endpoint; all vision calls below POST here.
const VISION_API_BASE_URL = 'https://text.pollinations.ai/openai';
/**
 * Analyzes an image using vision models through the Pollinations OpenAI-compatible API
 *
 * @param {Object} params - The parameters for image analysis
 * @param {string} params.imageUrl - URL of the image to analyze
 * @param {string} [params.prompt="Describe this image"] - Question or instruction about the image
 * @param {string} [params.model="gpt-4o"] - Vision model to use for analysis
 * @param {number} [params.maxTokens=1000] - Maximum tokens in response
 * @param {number} [params.temperature=0.7] - Temperature for response generation
 * @returns {Promise<Object>} - MCP response object with the analysis
 * @throws {Error} If imageUrl is missing/invalid or the API request fails
 */
async function analyzeImageFromUrl(params) {
  const {
    imageUrl,
    prompt = "Describe this image",
    model = "gpt-4o",
    maxTokens = 1000,
    temperature = 0.7
  } = params;
  if (!imageUrl || typeof imageUrl !== 'string') {
    throw new Error('Image URL is required and must be a string');
  }
  // OpenAI chat-completions payload: single user message mixing a text part and an image part.
  const requestBody = {
    model,
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          { type: "image_url", image_url: { url: imageUrl } }
        ]
      }
    ],
    max_tokens: maxTokens,
    temperature
  };
  try {
    const response = await fetch(VISION_API_BASE_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json'
      },
      body: JSON.stringify(requestBody)
    });
    if (!response.ok) {
      // statusText is often empty (notably over HTTP/2), so include the numeric status too.
      throw new Error(`Failed to analyze image: ${response.status} ${response.statusText}`);
    }
    const result = await response.json();
    // Defensive chaining: surface a readable placeholder if the API returns no choices.
    const analysisText = result.choices?.[0]?.message?.content || 'No analysis available';
    return createMCPResponse([
      createTextContent(`Image Analysis:\n\n${analysisText}\n\nModel: ${model}\nImage URL: ${imageUrl}`)
    ]);
  } catch (error) {
    // Log for server-side diagnostics, then rethrow so the MCP caller sees the failure.
    console.error('Error analyzing image:', error);
    throw error;
  }
}
/**
 * Analyzes a base64-encoded image using vision models
 *
 * @param {Object} params - The parameters for image analysis
 * @param {string} params.imageData - Base64-encoded image data (a complete `data:` URL is also accepted)
 * @param {string} [params.mimeType="image/jpeg"] - MIME type of the image (ignored if imageData is already a data URL)
 * @param {string} [params.prompt="Describe this image"] - Question or instruction about the image
 * @param {string} [params.model="gpt-4o"] - Vision model to use for analysis
 * @param {number} [params.maxTokens=1000] - Maximum tokens in response
 * @param {number} [params.temperature=0.7] - Temperature for response generation
 * @returns {Promise<Object>} - MCP response object with the analysis
 * @throws {Error} If imageData is missing/invalid or the API request fails
 */
async function analyzeImageFromData(params) {
  const {
    imageData,
    mimeType = "image/jpeg",
    prompt = "Describe this image",
    model = "gpt-4o",
    maxTokens = 1000,
    temperature = 0.7
  } = params;
  if (!imageData || typeof imageData !== 'string') {
    throw new Error('Image data is required and must be a base64 string');
  }
  // Accept an already-complete data URL; otherwise build one. Unconditional prefixing
  // would produce a broken double-prefixed URL when callers pass a full data URL.
  const dataUrl = imageData.startsWith('data:')
    ? imageData
    : `data:${mimeType};base64,${imageData}`;
  // OpenAI chat-completions payload: single user message mixing a text part and an image part.
  const requestBody = {
    model,
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          { type: "image_url", image_url: { url: dataUrl } }
        ]
      }
    ],
    max_tokens: maxTokens,
    temperature
  };
  try {
    const response = await fetch(VISION_API_BASE_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json'
      },
      body: JSON.stringify(requestBody)
    });
    if (!response.ok) {
      // statusText is often empty (notably over HTTP/2), so include the numeric status too.
      throw new Error(`Failed to analyze image: ${response.status} ${response.statusText}`);
    }
    const result = await response.json();
    // Defensive chaining: surface a readable placeholder if the API returns no choices.
    const analysisText = result.choices?.[0]?.message?.content || 'No analysis available';
    return createMCPResponse([
      createTextContent(`Image Analysis:\n\n${analysisText}\n\nModel: ${model}\nMIME Type: ${mimeType}`)
    ]);
  } catch (error) {
    // Log for server-side diagnostics, then rethrow so the MCP caller sees the failure.
    console.error('Error analyzing image:', error);
    throw error;
  }
}
/**
 * Compares two images and describes their similarities and differences
 *
 * @param {Object} params - The parameters for image comparison
 * @param {string} params.imageUrl1 - URL of the first image
 * @param {string} params.imageUrl2 - URL of the second image
 * @param {string} [params.prompt="Compare these two images and describe their similarities and differences"] - Comparison instruction
 * @param {string} [params.model="gpt-4o"] - Vision model to use for comparison
 * @param {number} [params.maxTokens=1500] - Maximum tokens in response
 * @param {number} [params.temperature=0.7] - Temperature for response generation
 * @returns {Promise<Object>} - MCP response object with the comparison
 * @throws {Error} If either image URL is missing/invalid or the API request fails
 */
async function compareImages(params) {
  const {
    imageUrl1,
    imageUrl2,
    prompt = "Compare these two images and describe their similarities and differences",
    model = "gpt-4o",
    maxTokens = 1500,
    temperature = 0.7
  } = params;
  if (!imageUrl1 || typeof imageUrl1 !== 'string') {
    throw new Error('First image URL is required and must be a string');
  }
  if (!imageUrl2 || typeof imageUrl2 !== 'string') {
    throw new Error('Second image URL is required and must be a string');
  }
  // Single user message carrying the instruction plus BOTH images, in order,
  // so the model can reference them as "image 1" and "image 2".
  const requestBody = {
    model,
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: prompt },
          { type: "image_url", image_url: { url: imageUrl1 } },
          { type: "image_url", image_url: { url: imageUrl2 } }
        ]
      }
    ],
    max_tokens: maxTokens,
    temperature
  };
  try {
    const response = await fetch(VISION_API_BASE_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json'
      },
      body: JSON.stringify(requestBody)
    });
    if (!response.ok) {
      // statusText is often empty (notably over HTTP/2), so include the numeric status too.
      throw new Error(`Failed to compare images: ${response.status} ${response.statusText}`);
    }
    const result = await response.json();
    // Defensive chaining: surface a readable placeholder if the API returns no choices.
    const comparisonText = result.choices?.[0]?.message?.content || 'No comparison available';
    return createMCPResponse([
      createTextContent(`Image Comparison:\n\n${comparisonText}\n\nModel: ${model}\nImage 1: ${imageUrl1}\nImage 2: ${imageUrl2}`)
    ]);
  } catch (error) {
    // Log for server-side diagnostics, then rethrow so the MCP caller sees the failure.
    console.error('Error comparing images:', error);
    throw error;
  }
}
/**
 * Extract text from an image using OCR capabilities
 *
 * @param {Object} params - The parameters for text extraction
 * @param {string} params.imageUrl - URL of the image to extract text from
 * @param {string} [params.model="gpt-4o"] - Vision model to use for OCR
 * @param {number} [params.maxTokens=2000] - Maximum tokens in response
 * @returns {Promise<Object>} - MCP response object with extracted text
 * @throws {Error} If imageUrl is missing/invalid or the API request fails
 */
async function extractTextFromImage(params) {
  const {
    imageUrl,
    model = "gpt-4o",
    maxTokens = 2000
  } = params;
  if (!imageUrl || typeof imageUrl !== 'string') {
    throw new Error('Image URL is required and must be a string');
  }
  const requestBody = {
    model,
    messages: [
      {
        role: "user",
        content: [
          { type: "text", text: "Extract all text from this image. Return only the extracted text, maintaining the original formatting and structure as much as possible." },
          { type: "image_url", image_url: { url: imageUrl } }
        ]
      }
    ],
    max_tokens: maxTokens,
    // Low temperature: OCR should be deterministic transcription, not creative output.
    temperature: 0.1
  };
  try {
    const response = await fetch(VISION_API_BASE_URL, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json'
      },
      body: JSON.stringify(requestBody)
    });
    if (!response.ok) {
      // statusText is often empty (notably over HTTP/2), so include the numeric status too.
      throw new Error(`Failed to extract text from image: ${response.status} ${response.statusText}`);
    }
    const result = await response.json();
    // Defensive chaining: surface a readable placeholder if the API returns no choices.
    const extractedText = result.choices?.[0]?.message?.content || 'No text found';
    return createMCPResponse([
      createTextContent(`Extracted Text:\n\n${extractedText}\n\nModel: ${model}\nImage URL: ${imageUrl}`)
    ]);
  } catch (error) {
    // Log for server-side diagnostics, then rethrow so the MCP caller sees the failure.
    console.error('Error extracting text from image:', error);
    throw error;
  }
}
/**
 * Export tools as complete arrays ready to be passed to server.tool()
 *
 * Each entry is a 4-tuple: [toolName, description, zodParamSchema, handler].
 * The zod schema shape mirrors each handler's JSDoc, including defaults.
 */
export const visionTools = [
// Single-image analysis via a public URL.
[
'analyzeImageFromUrl',
'Analyze an image from a URL using vision AI models',
{
imageUrl: z.string().describe('URL of the image to analyze'),
prompt: z.string().optional().describe('Question or instruction about the image (default: "Describe this image")'),
model: z.string().optional().describe('Vision model to use for analysis (default: "gpt-4o")'),
maxTokens: z.number().optional().describe('Maximum tokens in response (default: 1000)'),
temperature: z.number().optional().describe('Temperature for response generation (default: 0.7)')
},
analyzeImageFromUrl
],
// Single-image analysis from inline base64 data (sent to the API as a data URL).
[
'analyzeImageFromData',
'Analyze a base64-encoded image using vision AI models',
{
imageData: z.string().describe('Base64-encoded image data'),
mimeType: z.string().optional().describe('MIME type of the image (default: "image/jpeg")'),
prompt: z.string().optional().describe('Question or instruction about the image (default: "Describe this image")'),
model: z.string().optional().describe('Vision model to use for analysis (default: "gpt-4o")'),
maxTokens: z.number().optional().describe('Maximum tokens in response (default: 1000)'),
temperature: z.number().optional().describe('Temperature for response generation (default: 0.7)')
},
analyzeImageFromData
],
// Two-image comparison; both URLs go into one message so the model sees them together.
[
'compareImages',
'Compare two images and describe their similarities and differences',
{
imageUrl1: z.string().describe('URL of the first image'),
imageUrl2: z.string().describe('URL of the second image'),
prompt: z.string().optional().describe('Comparison instruction (default: "Compare these two images and describe their similarities and differences")'),
model: z.string().optional().describe('Vision model to use for comparison (default: "gpt-4o")'),
maxTokens: z.number().optional().describe('Maximum tokens in response (default: 1500)'),
temperature: z.number().optional().describe('Temperature for response generation (default: 0.7)')
},
compareImages
],
// OCR-style extraction; the handler pins a fixed prompt and low temperature internally.
[
'extractTextFromImage',
'Extract text from an image using OCR capabilities',
{
imageUrl: z.string().describe('URL of the image to extract text from'),
model: z.string().optional().describe('Vision model to use for OCR (default: "gpt-4o")'),
maxTokens: z.number().optional().describe('Maximum tokens in response (default: 2000)')
},
extractTextFromImage
]
];