indexVisualContent
Analyze video frames to create a searchable visual index using OCR, feature recognition, and optional AI descriptions for content discovery.
Instructions
Build a real visual index for a video using extracted frames, Apple Vision OCR, Apple Vision feature prints, and optional Gemini frame descriptions. Returns frame evidence with local image paths.
Input Schema
Table · JSON Schema
| Name | Required | Description | Default |
|---|---|---|---|
| videoIdOrUrl | Yes | Video ID or URL to index visually | |
| intervalSec | No | Frame sampling interval in seconds (default 20) | |
| maxFrames | No | Maximum frames to analyze (default 12) | |
| imageFormat | No | Image format for extracted frames (jpg, png, or webp) | |
| width | No | Frame width in pixels (160–3840) | |
| autoDownload | No | Automatically download a small local video copy if none exists (default true) | |
| downloadFormat | No | Video format used if auto-download is needed (default worst_video) | |
| forceReindex | No | Re-run OCR/description analysis even if frames are already indexed | |
| includeGeminiDescriptions | No | Use Gemini to describe each frame when a Gemini key is configured | |
| includeGeminiEmbeddings | No | Generate Gemini embeddings over OCR/description text for semantic retrieval (default true when Gemini key is available) | |
| dryRun | No | | |
Implementation Reference
- src/lib/visual-search.ts:392-528 (handler) — The `indexVideo` method in `VisualSearchEngine` is the handler for indexing visual content of a video, including OCR, scene description, and semantic embeddings.
/**
 * Build (or refresh) the visual index for a single video.
 *
 * Pipeline: resolve a local video asset (auto-downloading a small copy when
 * allowed), extract keyframes, run Vision OCR/feature analysis and optional
 * Gemini descriptions in parallel over frames that are not yet indexed,
 * optionally embed the per-frame retrieval text, and upsert one index record
 * per frame into the store.
 *
 * @param params indexing options (see IndexVisualContentParams).
 * @returns summary counts plus up to 12 frame evidence records sorted by timestamp.
 * @throws Error when no local video asset exists and auto-download is disabled or fails.
 */
async indexVideo(params: IndexVisualContentParams): Promise<IndexVisualContentResult> {
  const videoId = params.videoId;
  const sourceVideoUrl = params.sourceVideoUrl ?? `https://www.youtube.com/watch?v=${videoId}`;
  // Clamp user-supplied sampling controls into the same bounds the tool schema advertises.
  const intervalSec = clamp(params.intervalSec ?? 20, 2, 3600);
  const maxFrames = clamp(params.maxFrames ?? 12, 1, 100);
  // Descriptions default to "on" only when a Gemini describer is actually configured.
  const includeGeminiDescriptions = params.includeGeminiDescriptions ?? this.geminiDescriber.available;
  const descriptionProvider: "none" | "gemini" = includeGeminiDescriptions && this.geminiDescriber.available ? "gemini" : "none";
  const embeddingSelection = resolveGeminiEmbeddingSelection(params.includeGeminiEmbeddings);
  const embeddingProvider = embeddingSelection ? await createEmbeddingProvider(embeddingSelection) : null;
  const embeddingProviderKind: "none" | "gemini" = embeddingProvider ? "gemini" : "none";
  // forceReindex drops stored frames up front so every frame is re-analyzed below.
  if (params.forceReindex) {
    this.store.removeFramesForVideo(videoId);
  }
  let autoDownloaded = false;
  let videoAssetPath = this.findVideoAsset(videoId)?.filePath;
  // No local asset: optionally fetch a small copy (autoDownload defaults to true).
  if (!videoAssetPath && (params.autoDownload ?? true)) {
    const download = await this.mediaDownloader.download({
      videoIdOrUrl: videoId,
      format: params.downloadFormat ?? "worst_video",
    });
    videoAssetPath = download.asset.filePath;
    autoDownloaded = true;
  }
  if (!videoAssetPath) {
    throw new Error(`No local video asset found for ${videoId}. Run downloadAsset first or allow autoDownload.`);
  }
  const keyframes = await this.thumbnailExtractor.extractKeyframes({
    videoId,
    videoPath: videoAssetPath,
    intervalSec,
    maxFrames,
    imageFormat: params.imageFormat,
    width: params.width,
  });
  // Only frames not already indexed (or all frames under forceReindex) need analysis.
  const existingByPath = new Map(this.store.listFrames({ videoId }).map((frame) => [frame.framePath, frame]));
  const pendingAssets = keyframes.assets.filter((asset) => params.forceReindex || !existingByPath.has(asset.filePath));
  // Run OCR and Gemini descriptions IN PARALLEL — they're independent
  const [analyses, descriptions] = await Promise.all([
    pendingAssets.length > 0 ? this.visionAnalyzer.analyzeFrames(pendingAssets.map((asset) => asset.filePath)) : Promise.resolve([]),
    descriptionProvider === "gemini" ? this.geminiDescriber.describeFrames(pendingAssets.map((asset) => ({
      framePath: asset.filePath,
      videoId,
      timestampSec: asset.timestampSec ?? 0,
    }))) : Promise.resolve([]),
  ]);
  const analysisByPath = new Map(analyses.map((analysis) => [analysis.framePath, analysis]));
  const descriptionByPath = new Map(descriptions.map((item) => [item.framePath, item.description]));
  // Combine timestamp, OCR text, and visual description into one retrieval string per frame.
  const retrievalTexts = pendingAssets.map((asset) => {
    const analysis = analysisByPath.get(asset.filePath);
    const description = descriptionByPath.get(asset.filePath);
    return buildRetrievalText({
      timestampSec: asset.timestampSec ?? 0,
      ocrText: analysis?.ocrText,
      visualDescription: description,
    });
  });
  // Embed the retrieval text when a provider is configured; empty text gets a placeholder
  // so embedDocuments never receives an empty string.
  const textEmbeddings = embeddingProvider ? await embeddingProvider.embedDocuments(retrievalTexts.map((text) => text || "frame without visible text")) : [];
  const embeddingByPath = new Map<string, number[]>();
  pendingAssets.forEach((asset, index) => {
    if (textEmbeddings[index]?.length) {
      embeddingByPath.set(asset.filePath, textEmbeddings[index]!);
    }
  });
  const evidence: VisualIndexRecord[] = [];
  for (const asset of keyframes.assets) {
    // Reuse stored records for frames that were not re-analyzed this run.
    const existing = existingByPath.get(asset.filePath);
    if (existing && !params.forceReindex) {
      evidence.push(existing);
      continue;
    }
    const analysis = analysisByPath.get(asset.filePath);
    const visualDescription = descriptionByPath.get(asset.filePath);
    const retrievalText = buildRetrievalText({
      timestampSec: asset.timestampSec ?? 0,
      ocrText: analysis?.ocrText,
      visualDescription,
    });
    const record = this.store.upsertFrame({
      videoId,
      frameAssetId: asset.assetId,
      framePath: asset.filePath,
      timestampSec: asset.timestampSec ?? 0,
      sourceVideoUrl,
      sourceVideoTitle: params.sourceVideoTitle,
      ocrText: analysis?.ocrText,
      ocrConfidence: analysis?.ocrConfidence,
      visualDescription,
      retrievalText,
      featureVector: analysis?.featureVector,
      textEmbedding: embeddingByPath.get(asset.filePath),
      descriptionModel: descriptionProvider === "gemini" ? this.geminiDescriber.model : undefined,
      embeddingProvider: embeddingProviderKind,
      embeddingModel: embeddingProvider?.selection.model,
      embeddingDimensions: embeddingProvider?.selection.dimensions,
    });
    evidence.push(record);
  }
  return {
    videoId,
    sourceVideoUrl,
    sourceVideoTitle: params.sourceVideoTitle,
    videoAssetPath,
    autoDownloaded,
    framesExtracted: keyframes.framesExtracted,
    framesAnalyzed: pendingAssets.length,
    framesIndexed: evidence.length,
    intervalSec,
    maxFrames,
    descriptionProvider,
    descriptionModel: descriptionProvider === "gemini" ? this.geminiDescriber.model : undefined,
    embeddingProvider: embeddingProviderKind,
    embeddingModel: embeddingProvider?.selection.model,
    embeddingDimensions: embeddingProvider?.selection.dimensions,
    // Cap response evidence at 12 frames, ordered chronologically.
    evidence: evidence.sort((a, b) => a.timestampSec - b.timestampSec).slice(0, 12),
    limitations: buildIndexLimitations(descriptionProvider, embeddingProviderKind),
  };
}
- src/server/mcp-server.ts:1308-1323 (registration)The tool `indexVisualContent` is registered in the main MCP server request handler and calls the `service.indexVisualContent` method.
case "indexVisualContent": {
  // Decode the raw MCP tool arguments into a typed parameter object, then
  // delegate to the service layer (dryRun travels separately as call options).
  const input = {
    videoIdOrUrl: readString(args, "videoIdOrUrl"),
    intervalSec: optionalNumber(args, "intervalSec"),
    maxFrames: optionalNumber(args, "maxFrames"),
    imageFormat: optionalEnum(args, "imageFormat", ["jpg", "png", "webp"]),
    width: optionalNumber(args, "width"),
    autoDownload: optionalBoolean(args, "autoDownload"),
    downloadFormat: optionalEnum(args, "downloadFormat", ["best_video", "worst_video"]),
    forceReindex: optionalBoolean(args, "forceReindex"),
    includeGeminiDescriptions: optionalBoolean(args, "includeGeminiDescriptions"),
    includeGeminiEmbeddings: optionalBoolean(args, "includeGeminiEmbeddings"),
  };
  return service.indexVisualContent(input, { dryRun });
}
- src/server/mcp-server.ts:516-536 (schema)The JSON schema definition for the `indexVisualContent` tool within the MCP server definition.
// Tool manifest entry: the MCP client validates calls against inputSchema
// before the handler runs.
name: "indexVisualContent",
description: "Build a real visual index for a video using extracted frames, Apple Vision OCR, Apple Vision feature prints, and optional Gemini frame descriptions. Returns frame evidence with local image paths.",
inputSchema: {
  type: "object",
  properties: {
    videoIdOrUrl: { type: "string", description: "Video ID or URL to index visually" },
    // Numeric bounds mirror the clamp() limits in VisualSearchEngine.indexVideo.
    intervalSec: { type: "number", minimum: 2, maximum: 3600, description: "Frame sampling interval in seconds (default 20)" },
    maxFrames: { type: "number", minimum: 1, maximum: 100, description: "Maximum frames to analyze (default 12)" },
    imageFormat: { type: "string", enum: ["jpg", "png", "webp"] },
    width: { type: "number", minimum: 160, maximum: 3840 },
    autoDownload: { type: "boolean", description: "Automatically download a small local video copy if none exists (default true)" },
    downloadFormat: { type: "string", enum: ["best_video", "worst_video"], description: "Video format used if auto-download is needed (default worst_video)" },
    forceReindex: { type: "boolean", description: "Re-run OCR/description analysis even if frames are already indexed" },
    includeGeminiDescriptions: { type: "boolean", description: "Use Gemini to describe each frame when a Gemini key is configured" },
    includeGeminiEmbeddings: { type: "boolean", description: "Generate Gemini embeddings over OCR/description text for semantic retrieval (default true when Gemini key is available)" },
    dryRun: { type: "boolean" },
  },
  required: ["videoIdOrUrl"],
  additionalProperties: false,
},
},