"""
Qwen Video Understanding MCP Server
An MCP server that uses Qwen3-VL deployed on Modal to analyze videos and images.
Provides video understanding capabilities to Claude Code and other AI agents.
Features:
- Analyze videos via URL (extracts frames and analyzes)
- Analyze images via URL
- Hours-long video support with full recall
- Second-level timestamp grounding
- Multiple analysis modes (summary, detailed, transcript, Q&A)
- Supports both local Modal deployment and cloud endpoints
"""
import os
import asyncio
import httpx
from typing import Optional, Annotated
from datetime import timedelta
from mcp.server.fastmcp import FastMCP
from pydantic import Field
# Initialize the MCP server
mcp = FastMCP("qwen-video-understanding")
# Configuration
DEFAULT_MODAL_WORKSPACE = os.environ.get("MODAL_WORKSPACE", "adam-31541")
DEFAULT_MODAL_APP = os.environ.get("MODAL_APP", "qwen-video-understanding")
# Timeout settings
REQUEST_TIMEOUT = 300 # 5 minutes for video processing
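# Example environment configuration (hypothetical values; the QWEN_* variables are
# optional full-URL overrides of the Modal URLs derived in get_endpoint_urls below):
#   export MODAL_WORKSPACE=my-workspace
#   export MODAL_APP=qwen-video-understanding
#   export QWEN_VIDEO_ENDPOINT=https://my-workspace--qwen-video-understanding-videomodel-analyze-video.modal.run
#   export QWEN_IMAGE_ENDPOINT=https://my-workspace--qwen-video-understanding-videomodel-analyze-image.modal.run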
def get_endpoint_urls() -> dict:
"""Get the Modal endpoint URLs from environment or defaults."""
workspace = os.environ.get("MODAL_WORKSPACE", DEFAULT_MODAL_WORKSPACE)
app = os.environ.get("MODAL_APP", DEFAULT_MODAL_APP)
# Allow full URL override
image_url = os.environ.get(
"QWEN_IMAGE_ENDPOINT",
f"https://{workspace}--{app}-videomodel-analyze-image.modal.run"
)
video_url = os.environ.get(
"QWEN_VIDEO_ENDPOINT",
f"https://{workspace}--{app}-videomodel-analyze-video.modal.run"
)
return {
"image": image_url,
"video": video_url
}
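# With no overrides set, the endpoints resolve to Modal web-endpoint URLs of the form
# https://<workspace>--<app>-videomodel-analyze-{video,image}.modal.run. A quick,
# network-free sanity check:
#   >>> sorted(get_endpoint_urls())
#   ['image', 'video']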
def format_duration(seconds: float) -> str:
"""Format duration in human-readable format."""
td = timedelta(seconds=int(seconds))
hours, remainder = divmod(td.seconds, 3600)
minutes, secs = divmod(remainder, 60)
if td.days > 0:
return f"{td.days}d {hours}h {minutes}m"
elif hours > 0:
return f"{hours}h {minutes}m {secs}s"
elif minutes > 0:
return f"{minutes}m {secs}s"
else:
return f"{secs}s"
@mcp.tool()
async def analyze_video(
video_url: Annotated[str, Field(description="URL of the video to analyze (must be publicly accessible)")],
question: Annotated[str, Field(description="Question or prompt about the video")] = "Describe what happens in this video in detail.",
max_frames: Annotated[int, Field(description="Maximum number of frames to extract (1-64)")] = 32,
max_tokens: Annotated[int, Field(description="Maximum tokens in response")] = 1024
) -> dict:
"""
Analyze a video using the Qwen3-VL vision-language model.
The video must be accessible via a public URL. The model will:
1. Download the video
2. Extract key frames (up to max_frames)
3. Analyze the frames with your question
4. Provide timestamp-grounded responses when applicable
Examples:
- "What happens in this video?"
- "Summarize the main events with timestamps"
- "What products are shown?"
- "At what timestamp does the speaker mention X?"
- "What is being discussed or demonstrated?"
"""
endpoints = get_endpoint_urls()
payload = {
"video_url": video_url,
"question": question,
"max_frames": min(max(1, max_frames), 64),
"max_tokens": max_tokens
}
try:
async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
response = await client.post(
endpoints["video"],
json=payload,
headers={"Content-Type": "application/json"}
)
response.raise_for_status()
result = response.json()
if "error" in result:
return {
"status": "error",
"error": result["error"],
"video_url": video_url
}
return {
"status": "success",
"video_url": video_url,
"question": question,
"frames_analyzed": result.get("frames_analyzed", "unknown"),
"processing_time": f"{result.get('processing_time', 'unknown')}s",
"analysis": result.get("answer", "No analysis returned")
}
except httpx.TimeoutException:
return {
"status": "error",
"error": "Request timed out. Video may be too large or server is busy.",
"video_url": video_url,
"suggestion": "Try again or use a shorter video"
}
except httpx.HTTPStatusError as e:
return {
"status": "error",
"error": f"HTTP error: {e.response.status_code}",
"video_url": video_url
}
except Exception as e:
return {
"status": "error",
"error": str(e),
"video_url": video_url
}
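# Minimal usage sketch for analyze_video (defined but never executed on import).
# The URL below is a placeholder, not a real asset; run with
# asyncio.run(_example_analyze_video()) for a quick smoke test.
async def _example_analyze_video() -> None:
    result = await analyze_video(
        video_url="https://example.com/sample.mp4",  # placeholder URL
        question="Summarize the main events with timestamps",
        max_frames=16,
    )
    # On success the model's answer is under "analysis"; on failure, "error".
    print(result["status"], result.get("analysis") or result.get("error"))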
@mcp.tool()
async def analyze_image(
image_url: Annotated[str, Field(description="URL of the image to analyze (must be publicly accessible)")],
question: Annotated[str, Field(description="Question or prompt about the image")] = "Describe this image in detail.",
max_tokens: Annotated[int, Field(description="Maximum tokens in response")] = 512
) -> dict:
"""
Analyze an image using the Qwen3-VL vision-language model.
The image must be accessible via a public URL.
Examples:
- "What's in this image?"
- "Describe the scene"
- "What text is visible?"
- "Identify any people or objects"
- "What is the mood or atmosphere?"
"""
endpoints = get_endpoint_urls()
payload = {
"image_url": image_url,
"question": question,
"max_tokens": max_tokens
}
try:
async with httpx.AsyncClient(timeout=REQUEST_TIMEOUT) as client:
response = await client.post(
endpoints["image"],
json=payload,
headers={"Content-Type": "application/json"}
)
response.raise_for_status()
result = response.json()
if "error" in result:
return {
"status": "error",
"error": result["error"],
"image_url": image_url
}
return {
"status": "success",
"image_url": image_url,
"question": question,
"processing_time": f"{result.get('processing_time', 'unknown')}s",
"analysis": result.get("answer", "No analysis returned")
}
except httpx.TimeoutException:
return {
"status": "error",
"error": "Request timed out",
"image_url": image_url
}
except httpx.HTTPStatusError as e:
return {
"status": "error",
"error": f"HTTP error: {e.response.status_code}",
"image_url": image_url
}
except Exception as e:
return {
"status": "error",
"error": str(e),
"image_url": image_url
}
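# Companion sketch for analyze_image (placeholder URL; hypothetical helper, not
# called anywhere else): asyncio.run(_example_analyze_image()) sends one request.
async def _example_analyze_image() -> None:
    result = await analyze_image(
        image_url="https://example.com/photo.jpg",  # placeholder URL
        question="What text is visible in this image?",
    )
    print(result["status"], result.get("analysis") or result.get("error"))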
@mcp.tool()
async def summarize_video(
video_url: Annotated[str, Field(description="URL of the video to summarize")],
style: Annotated[str, Field(description="Summary style: 'brief' (1-2 sentences), 'standard' (1-2 paragraphs), 'detailed' (comprehensive)")] = "standard"
) -> dict:
"""
Generate a summary of a video.
Styles:
- brief: 1-2 sentence overview
- standard: 1-2 paragraph summary with key points
- detailed: Comprehensive analysis with timeline
"""
prompts = {
"brief": "Provide a 1-2 sentence summary of what happens in this video.",
"standard": "Summarize this video in 1-2 paragraphs. Include the main topic, key events or points, and the overall message.",
"detailed": """Provide a comprehensive analysis of this video:
1. Main topic/theme
2. Key events in chronological order
3. Important visual elements
4. Any speech or text content
5. Overall takeaway"""
}
prompt = prompts.get(style, prompts["standard"])
max_tokens = {"brief": 128, "standard": 512, "detailed": 1024}.get(style, 512)
return await analyze_video(
video_url=video_url,
question=prompt,
max_frames=16 if style == "detailed" else 12,
max_tokens=max_tokens
)
@mcp.tool()
async def extract_video_text(
video_url: Annotated[str, Field(description="URL of the video")],
) -> dict:
"""
Extract and transcribe any visible text or speech from a video.
Useful for:
- Reading on-screen text, titles, captions
- Transcribing spoken content
- Extracting text from presentations or documents shown in video
"""
return await analyze_video(
video_url=video_url,
question="""Extract all text content from this video:
1. Any on-screen text, titles, or captions
2. Transcribe any spoken words
3. Text from documents, slides, or signs shown
Format as a clear list with context for each piece of text.""",
max_frames=24,
max_tokens=1024
)
@mcp.tool()
async def video_qa(
video_url: Annotated[str, Field(description="URL of the video")],
question: Annotated[str, Field(description="Your specific question about the video")]
) -> dict:
"""
Ask a specific question about a video's content.
Examples:
- "How many people appear in this video?"
- "What color is the car?"
- "What is the speaker's main argument?"
- "What products are being demonstrated?"
- "At what point does the action begin?"
"""
return await analyze_video(
video_url=video_url,
question=f"Answer this question about the video: {question}\n\nProvide a clear, direct answer based on what you can see and hear.",
max_frames=16,
max_tokens=512
)
@mcp.tool()
async def compare_video_frames(
video_url: Annotated[str, Field(description="URL of the video")],
comparison_prompt: Annotated[str, Field(description="What to compare across the video")] = "Describe how the scene changes throughout the video"
) -> dict:
"""
Analyze changes and progression across a video.
Useful for:
- Before/after comparisons
- Tracking movement or changes
- Understanding progression of events
- Analyzing tutorials or how-to videos
"""
return await analyze_video(
video_url=video_url,
question=f"""Analyze the progression and changes in this video:
{comparison_prompt}
Describe:
1. The initial state/scene
2. Key changes or transitions
3. The final state/outcome
4. Notable differences between beginning and end""",
max_frames=24,
max_tokens=1024
)
@mcp.tool()
def check_endpoint_status() -> dict:
"""
Check the configuration and status of the Modal endpoints.
Returns the configured endpoint URLs and connection status.
"""
endpoints = get_endpoint_urls()
return {
"status": "configured",
"endpoints": {
"image_analysis": endpoints["image"],
"video_analysis": endpoints["video"]
},
"configuration": {
"workspace": os.environ.get("MODAL_WORKSPACE", DEFAULT_MODAL_WORKSPACE),
"app": os.environ.get("MODAL_APP", DEFAULT_MODAL_APP)
},
"notes": [
"Endpoints are serverless and may have cold start delay on first request",
"Video analysis extracts frames and analyzes them with Qwen2.5-VL",
"Large videos may take longer to process"
]
}
@mcp.tool()
def list_capabilities() -> dict:
"""List the capabilities of this video understanding server."""
return {
"model": "Qwen3-VL-8B-Instruct",
"deployment": "Modal (serverless GPU)",
"capabilities": [
"Video summarization",
"Video Q&A",
"Image analysis",
"Text extraction from video",
"Scene change detection",
"Multi-frame analysis",
"Timestamp grounding",
"Long video understanding",
"32-language OCR"
],
"supported_formats": {
"video": ["mp4", "webm", "mov", "avi", "mkv"],
"image": ["jpg", "jpeg", "png", "gif", "webp", "bmp"]
},
"limits": {
"max_frames": 64,
"context": "256K tokens (expandable to 1M)",
"video_source": "URL only (must be publicly accessible)",
"recommended_video_length": "Hours-long supported with full recall"
},
"tips": [
"Use publicly accessible URLs (cloud storage, CDN)",
"Qwen3-VL handles long videos with full context",
"Be specific in your questions for better answers",
"Use summarize_video for quick overviews",
"Use video_qa for specific questions",
"Ask for timestamps - Qwen3-VL has precise grounding"
]
}
# Resource for server info
@mcp.resource("resource://server-info")
def get_server_info() -> dict:
"""Get information about this MCP server's capabilities."""
return {
"name": "Qwen Video Understanding MCP",
"version": "2.0.0",
"description": "Analyze videos and images using Qwen3-VL on Modal",
"model": "Qwen3-VL-8B-Instruct",
"backend": "Modal serverless GPU (A100/H100)",
"features": [
"Hours-long video with full recall",
"Second-level timestamp grounding",
"256K context (expandable to 1M)",
"32-language OCR support"
],
"tools": [
"analyze_video - Analyze video with custom prompt",
"analyze_image - Analyze image with custom prompt",
"summarize_video - Generate video summary",
"extract_video_text - Extract text/speech from video",
"video_qa - Ask questions about video",
"compare_video_frames - Analyze changes over time",
"check_endpoint_status - Check API configuration",
"list_capabilities - List all capabilities"
]
}
def main():
"""Main entry point."""
mcp.run(transport="stdio")
if __name__ == "__main__":
main()
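# Example MCP client registration (the exact config path and schema depend on your
# client; this follows the common "mcpServers" convention used by Claude Code /
# Claude Desktop, and the script filename is illustrative):
#
#   {
#     "mcpServers": {
#       "qwen-video-understanding": {
#         "command": "python",
#         "args": ["qwen_video_mcp_server.py"],
#         "env": {"MODAL_WORKSPACE": "my-workspace"}
#       }
#     }
#   }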