omni_video_preview
Generates a filmstrip PNG (with an audio waveform) of a video segment to verify cut boundaries or B-roll placement. Returns a status message containing the absolute path to the preview image.
Instructions
Generates a filmstrip PNG of the specified video segment. Useful for visually verifying cut boundaries or B-roll placement. Returns a status message containing the absolute path to the generated PNG file, or an error message if the preview could not be generated.
Input Schema
| Name | Required | Description | Default |
|---|---|---|---|
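| request | Yes | PreviewRequest object with file_path (absolute path to the video file), start_time and end_time (both in seconds). | |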
Output Schema
| Name | Required | Description | Default |
|---|---|---|---|
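| result | Yes | Status string: "Success: Preview generated at <path>" on success, or a message starting with "Error" on failure. | |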
Implementation Reference
- server.py:103-134 (handler) Main tool handler for omni_video_preview. Takes a PreviewRequest (file_path, start_time, end_time), validates that the video file exists, creates the output directory, and calls render_timeline() to generate a filmstrip PNG with waveform. Returns a status message containing the path to the generated preview PNG.
@mcp.tool()
async def omni_video_preview(request: PreviewRequest) -> str:
    """
    Generates a filmstrip PNG of the specified video segment.
    Useful for visually verifying cut boundaries or B-roll placement.
    Returns a status message: "Success: Preview generated at <absolute path>"
    on success, or a message starting with "Error" if the preview could not
    be generated.
    """
    # Function-scope import so this handler stays self-contained.
    import asyncio

    video_path = Path(request.file_path).resolve()
    # is_file() rather than exists(): a directory at this path is not a usable
    # video and would otherwise only fail later inside ffmpeg.
    if not video_path.is_file():
        return f"Error: File {video_path} not found."

    # An empty or reversed range cannot produce a meaningful filmstrip, so
    # report it up front instead of rendering a degenerate preview.
    if request.end_time <= request.start_time:
        return (
            f"Error: end_time ({request.end_time}) must be greater than "
            f"start_time ({request.start_time})."
        )

    # Previews are written to <video dir>/edit/verify/<stem>_<start>-<end>.png
    out_dir = video_path.parent / "edit" / "verify"
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / f"{video_path.stem}_{request.start_time:.2f}-{request.end_time:.2f}.png"

    # Attempt to auto-resolve transcript for shading; it is optional, so a
    # missing file simply disables the transcript-based overlays.
    transcript_candidate = video_path.parent / "edit" / "transcripts" / f"{video_path.stem}.json"
    transcript_path = transcript_candidate if transcript_candidate.exists() else None

    try:
        # render_timeline is synchronous (it shells out to ffmpeg and composes
        # the image), so run it in a worker thread to avoid blocking the event
        # loop while the preview is being built.
        await asyncio.to_thread(
            render_timeline,
            video=video_path,
            start=request.start_time,
            end=request.end_time,
            out_path=out_path,
            n_frames=10,
            transcript=transcript_path,
        )
        return f"Success: Preview generated at {out_path}"
    except Exception as e:
        # Tool boundary: failures are reported to the caller as text.
        return f"Error generating preview: {e}"
# - server.py:19-22 (schema) Pydantic model PreviewRequest defining input schema: file_path (str), start_time (float), end_time (float).
class PreviewRequest(BaseModel):
    # Input schema for the omni_video_preview tool. All three fields are
    # required (`...` marks a field without a default).

    # Location of the video to preview.
    file_path: str = Field(..., description="Absolute path to the video file.")
    # Start of the segment to preview, in seconds.
    start_time: float = Field(..., description="Start time in seconds.")
    # End of the segment to preview, in seconds.
    end_time: float = Field(..., description="End time in seconds.")
# - server.py:103-104 (registration) Tool is registered via the @mcp.tool() decorator on the async function omni_video_preview.
@mcp.tool() async def omni_video_preview(request: PreviewRequest) -> str: - helpers/timeline_view.py:184-331 (helper)Core helper function render_timeline() that extracts frames via ffmpeg, composites a filmstrip, renders audio waveform with word labels and silence shading, and saves as PNG.
def render_timeline(
    video: Path,
    start: float,
    end: float,
    out_path: Path,
    n_frames: int,
    transcript: Path | None,
) -> None:
    """Render a filmstrip-plus-waveform overview of a video segment to a PNG.

    The image contains, top to bottom: a header with the file name and time
    range, a row of frames extracted from ``[start, end]``, an audio envelope
    with optional silence shading and word labels, and a time ruler.

    Args:
        video: Source video file.
        start: Segment start, in seconds.
        end: Segment end, in seconds.
        out_path: Destination PNG; parent directories are created if needed.
        n_frames: Number of frames requested from ``extract_frames``.
        transcript: Optional transcript file passed to ``words_in_range``;
            when ``None`` no silence shading or word labels are drawn.

    Returns:
        None. The PNG is written to ``out_path`` and progress is printed to
        stdout.

    Raises:
        Whatever the helpers raise (for example a failed ffmpeg call in
        ``extract_frames``) or Pillow raises while reading or saving images.
    """
    # Layout metrics (pixels)
    canvas_width = 1920
    frame_h = 180
    filmstrip_y = 50
    filmstrip_h = frame_h
    wave_y = filmstrip_y + filmstrip_h + 20
    wave_h = 220
    label_y = wave_y + wave_h + 10
    canvas_height = label_y + 60

    # Frame extraction
    imgs: list[Image.Image] = []
    with tempfile.TemporaryDirectory() as tmp:
        tmp_dir = Path(tmp)
        print(f"extracting {n_frames} frames from {start:.2f}s to {end:.2f}s")
        frame_paths = extract_frames(video, start, end, n_frames, tmp_dir)

        # Load + resize frames to uniform height. convert() yields an
        # independent in-memory copy, so the file handle is closed right away
        # and nothing keeps the temporary directory busy during cleanup.
        for fp in frame_paths:
            with Image.open(fp) as raw:
                img = raw.convert("RGB")
            aspect = img.width / img.height
            new_w = int(frame_h * aspect)
            imgs.append(img.resize((new_w, frame_h), Image.LANCZOS))

    # Total filmstrip width, including a 4 px gap between neighbouring frames
    total_frame_w = sum(img.width for img in imgs) + (len(imgs) - 1) * 4
    content_w = max(1400, total_frame_w)
    canvas_width = max(canvas_width, content_w + 100)

    canvas = Image.new("RGB", (canvas_width, canvas_height), BG)
    draw = ImageDraw.Draw(canvas, "RGBA")

    header_font = load_font(22)
    label_font = load_font(14)
    small_font = load_font(12)

    # Header — time range
    draw.text(
        (50, 12),
        f"{video.name} {start:.2f}s → {end:.2f}s ({(end - start):.2f}s, {n_frames} frames)",
        fill=FG,
        font=header_font,
    )

    # Filmstrip
    strip_width = canvas_width - 100
    if total_frame_w <= strip_width:
        cursor = 50
        for img in imgs:
            canvas.paste(img, (cursor, filmstrip_y))
            cursor += img.width + 4
        draw_width = cursor - 50
    else:
        # NOTE(review): canvas_width was widened above to at least
        # total_frame_w + 100, so this branch looks unreachable with the
        # current sizing; kept as a fallback that shrinks frames to fit.
        scale = strip_width / total_frame_w
        new_h = int(frame_h * scale)
        cursor = 50
        for img in imgs:
            new_w = int(img.width * scale)
            scaled = img.resize((new_w, new_h), Image.LANCZOS)
            canvas.paste(scaled, (cursor, filmstrip_y + (filmstrip_h - new_h) // 2))
            cursor += new_w + max(2, int(4 * scale))
        draw_width = cursor - 50

    # The waveform, labels and ruler share the horizontal extent of the
    # filmstrip so that times line up across rows.
    strip_x0 = 50
    strip_x1 = 50 + draw_width
    strip_span = strip_x1 - strip_x0

    def time_to_x(t: float) -> int:
        # Linear map from [start, end] to [strip_x0, strip_x1]; the max()
        # guards against division by zero for an empty range.
        frac = (t - start) / max(1e-6, (end - start))
        return int(strip_x0 + frac * strip_span)

    # Waveform background
    draw.rectangle((strip_x0, wave_y, strip_x1, wave_y + wave_h), fill=(28, 28, 34))

    # Silence shading (under the waveform)
    words = words_in_range(transcript, start, end) if transcript else []
    silences = find_silences(words, start, end, threshold=0.4) if words else []
    for a, b in silences:
        xa = time_to_x(a)
        xb = time_to_x(b)
        draw.rectangle((xa, wave_y, xb, wave_y + wave_h), fill=SILENCE)

    # Waveform envelope
    # NOTE(review): values from compute_envelope are scaled by max_amp, so
    # they are presumably normalised to 0..1 — confirm against the helper.
    env = compute_envelope(video, start, end, samples=max(strip_span, 200))
    mid_y = wave_y + wave_h // 2
    max_amp = wave_h // 2 - 8
    points_top: list[tuple[int, int]] = []
    points_bot: list[tuple[int, int]] = []
    for i, v in enumerate(env):
        xi = strip_x0 + int(i * strip_span / max(1, len(env) - 1))
        a = int(v * max_amp)
        points_top.append((xi, mid_y - a))
        points_bot.append((xi, mid_y + a))
    if points_top:
        draw.line(points_top, fill=WAVE, width=1, joint="curve")
        draw.line(points_bot, fill=WAVE, width=1, joint="curve")
        # Fill between
        poly = points_top + list(reversed(points_bot))
        draw.polygon(poly, fill=(*WAVE, 60))

    # Word labels above the waveform (only words lasting ≥ 50 ms, and at
    # least 28 px apart, to avoid clutter)
    last_label_x = -9999
    for w in words:
        if w.get("type") != "word":
            continue
        ws = w.get("start")
        we = w.get("end")
        text = (w.get("text") or "").strip()
        if not text or ws is None or we is None:
            continue
        if (we - ws) < 0.05:
            continue
        cx = (time_to_x(ws) + time_to_x(we)) // 2
        if cx - last_label_x < 28:
            continue
        # Tiny tick on the waveform
        draw.line((cx, wave_y - 4, cx, wave_y), fill=DIM, width=1)
        # Text above the waveform
        draw.text((cx + 2, wave_y - 18), text, fill=FG, font=small_font)
        last_label_x = cx

    # Time ruler below waveform
    ruler_y = wave_y + wave_h + 2
    n_ticks = 6
    for i in range(n_ticks + 1):
        frac = i / n_ticks
        t = start + frac * (end - start)
        xi = strip_x0 + int(frac * strip_span)
        draw.line((xi, ruler_y, xi, ruler_y + 6), fill=DIM, width=1)
        draw.text((xi - 20, ruler_y + 8), f"{t:.2f}s", fill=DIM, font=label_font)

    # Silences legend if any
    if silences:
        txt = f"shaded bands = silences ≥ 400ms ({len(silences)} gap(s))"
        draw.text((strip_x0, label_y + 30), txt, fill=DIM, font=label_font)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    canvas.save(out_path, "PNG", optimize=True)
    print(f"saved: {out_path} ({out_path.stat().st_size // 1024} KB)")
# - helpers/timeline_view.py:37-62 (helper) Helper function extract_frames() used by render_timeline to extract N evenly spaced frames from a video segment using ffmpeg.
def extract_frames(video: Path, start: float, end: float, n: int, dest_dir: Path) -> list[Path]:
    """Write N evenly spaced JPEG frames from [start, end] and return their paths.

    Frames are saved as ``f_000.jpg``, ``f_001.jpg``, ... inside ``dest_dir``
    (created if missing), one ffmpeg invocation per frame. Values of ``n``
    below 1 are treated as 1; a single frame is taken from the midpoint of
    the range, otherwise the first and last frames sit on ``start`` and
    ``end``.

    Raises:
        subprocess.CalledProcessError: if any ffmpeg call exits non-zero.
    """
    dest_dir.mkdir(parents=True, exist_ok=True)

    count = 1 if n < 1 else n
    if count == 1:
        timestamps = [(start + end) / 2.0]
    else:
        interval = (end - start) / (count - 1)
        timestamps = [start + k * interval for k in range(count)]

    targets = [dest_dir / f"f_{index:03d}.jpg" for index in range(len(timestamps))]
    for ts, target in zip(timestamps, targets):
        # -ss is given before -i, and exactly one video frame is written,
        # scaled to 320 px wide (-2 lets ffmpeg pick a matching even height).
        subprocess.run(
            [
                "ffmpeg", "-y",
                "-ss", f"{ts:.3f}",
                "-i", str(video),
                "-frames:v", "1",
                "-q:v", "4",
                "-vf", "scale=320:-2",
                str(target),
            ],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    return targets