EnriVision

AnalyzeMediaTool.ts•23.5 KiB

/**
 * ANALYZE MEDIA TOOL
 *
 * Implements the `analyze_media` MCP tool:
 * - Validates local file path
 * - Streams the file to EnriProxy via resumable uploads
 * - Triggers server-side analysis and returns text-only results
 *
 * @module tools/AnalyzeMediaTool
 */

import { existsSync } from "node:fs";
import { stat, open } from "node:fs/promises";
import { basename, extname, isAbsolute } from "node:path";
import { randomUUID } from "node:crypto";
import { lookup as mimeLookup } from "mime-types";
import {
  EnriProxyHttpError,
  type CreateUploadSessionResponse,
  type EnriProxyClient
} from "../client/EnriProxyClient.js";
import {
  assertHttpUrl,
  assertNonEmptyString,
  assertObject,
  optionalBoolean,
  optionalInt,
  optionalNumber,
  optionalString
} from "../shared/validation.js";
import { computeTarSizeBytes, TarStream, type TarEntry } from "../shared/tar.js";

/**
 * Content type for EnriVision media-set archives (tar, no compression).
 */
const ENRIVISION_MEDIA_SET_TAR_CONTENT_TYPE = "application/vnd.enrivision.media-set+tar";

/**
 * Fixed manifest entry name inside EnriVision media-set tar archives.
 */
const ENRIVISION_MEDIA_SET_TAR_MANIFEST_NAME = "manifest.json";

/**
 * Keys removed (at any nesting depth) from the extraction payload before it is
 * returned to the caller.
 */
const INTERNAL_EXTRACTION_KEYS: ReadonlySet<string> = new Set([
  "upload_id",
  "uploadId",
  "detected_media_type",
  "analysis_mode_requested",
  "analysis_mode_used",
  "multipass",
  "model"
]);

/**
 * Tool parameters for `analyze_media`.
 */
export interface AnalyzeMediaToolParams {
  /**
   * Absolute local filesystem path on the MCP host.
   *
   * @remarks
   * Use `paths` to analyze multiple images in a single call.
   */
  readonly path?: string;

  /**
   * Absolute local filesystem paths on the MCP host.
   *
   * @remarks
   * When provided, EnriVision uploads the files as a single media-set archive
   * (resumable, up to 4GB) and triggers server-side batching + reduce.
   *
   * This is intended for many UI screenshots / photo sets.
   */
  readonly paths?: ReadonlyArray<string>;

  /**
   * Optional analysis hint (ui, diagram, chart, error, code, meeting, tutorial, photo).
   */
  readonly context?: string;

  /**
   * Optional explicit user question.
   */
  readonly question?: string;

  /**
   * Preferred response language code (e.g., "es", "en").
   */
  readonly language?: string;

  /**
   * Optional max frames override for videos (1-20).
   */
  readonly maxFrames?: number;

  /**
   * Optional override for transcription on videos.
   */
  readonly transcribe?: boolean;

  /**
   * Optional transcription language hint for Whisper.
   */
  readonly transcriptionLanguage?: string;

  /**
   * Optional analysis mode selector (auto|single|multipass).
   */
  readonly analysisMode?: "auto" | "single" | "multipass";

  /**
   * Optional video multipass tuning.
   */
  readonly video?: {
    /**
     * Optional clip start offset in seconds for targeted video analysis.
     *
     * @remarks
     * Use this when the question references a specific timestamp to avoid
     * scanning the full timeline.
     */
    readonly clipStartSeconds?: number;

    /**
     * Optional clip duration in seconds for targeted video analysis.
     *
     * @remarks
     * Use together with {@link clipStartSeconds} to analyze only a time window.
     */
    readonly clipDurationSeconds?: number;

    /**
     * Segment duration in seconds.
     */
    readonly segmentSeconds?: number;

    /**
     * Maximum number of segments to analyze.
     */
    readonly maxSegments?: number;

    /**
     * Maximum frames per segment.
     */
    readonly maxFramesPerSegment?: number;
  };

  /**
   * Optional document multipass tuning (PDF).
   */
  readonly document?: {
    /**
     * Maximum pages to analyze in total.
     */
    readonly maxPagesTotal?: number;

    /**
     * Pages per batch.
     */
    readonly pagesPerBatch?: number;

    /**
     * Maximum rendered pages per batch.
     */
    readonly maxImagesPerBatch?: number;

    /**
     * Minimum extracted text length to treat a page as textual.
     */
    readonly scannedTextThresholdChars?: number;
  };

  /**
   * Optional audio multipass tuning.
   */
  readonly audio?: {
    /**
     * Whether to include timestamped segments in audio extraction.
     */
    readonly timestamps?: boolean;

    /**
     * Segment duration in seconds for audio multipass.
     */
    readonly segmentSeconds?: number;

    /**
     * Maximum number of audio segments to analyze.
     */
    readonly maxSegments?: number;
  };

  /**
   * Optional image-set multipass tuning.
   *
   * @remarks
   * Used only when analyzing multiple images via `paths`.
   */
  readonly images?: {
    /**
     * Maximum number of images to analyze in total.
     */
    readonly maxImagesTotal?: number;

    /**
     * Images per batch for multipass map calls.
     */
    readonly imagesPerBatch?: number;

    /**
     * Maximum dimension for images (width/height).
     */
    readonly maxDimension?: number;
  };
}

/**
 * Structured result for `analyze_media`.
 */
export interface AnalyzeMediaToolResult extends Record<string, unknown> {
  /**
   * Text analysis produced by EnriProxy.
   */
  readonly analysis: string;

  /**
   * Detected media type.
   */
  readonly media_type: string;

  /**
   * Extraction metadata returned by the server.
   *
   * @remarks
   * This metadata is intended for debugging and transparency (e.g., duration,
   * selected frames, warnings). Internal identifiers like upload ids are
   * stripped to avoid leaking implementation details into the model context.
   */
  readonly extraction: Record<string, unknown>;
}

/**
 * Dependencies for {@link AnalyzeMediaTool}.
 */
export interface AnalyzeMediaToolDeps {
  /**
   * Creates an EnriProxy client with a base URL, API key, and timeout.
   *
   * @param serverUrl - EnriProxy URL
   * @param apiKey - EnriProxy API key
   * @param timeoutMs - Timeout in ms
   * @returns Client instance
   */
  readonly createClient: (serverUrl: string, apiKey: string, timeoutMs: number) => EnriProxyClient;

  /**
   * Default EnriProxy server URL.
   */
  readonly defaultServerUrl: string;

  /**
   * Default EnriProxy API key.
   */
  readonly defaultApiKey: string;

  /**
   * Default timeout in milliseconds.
   */
  readonly defaultTimeoutMs: number;
}

/**
 * MCP tool that uploads and analyzes local media.
 */
export class AnalyzeMediaTool {
  /**
   * Tool dependencies.
   */
  private readonly deps: AnalyzeMediaToolDeps;

  /**
   * Creates a new {@link AnalyzeMediaTool}.
   *
   * @param deps - Tool dependencies
   */
  public constructor(deps: AnalyzeMediaToolDeps) {
    this.deps = deps;
  }

  /**
   * Validates raw MCP tool arguments.
   *
   * @remarks
   * Wire-level keys are snake_case (`max_frames`, `analysis_mode`, ...) and are
   * mapped onto the camelCase fields of {@link AnalyzeMediaToolParams}.
   * A non-string `path` and non-string or blank `paths` items are ignored
   * rather than rejected.
   *
   * @param raw - Raw tool arguments
   * @returns Validated parameters
   * @throws Error when a path is not absolute, `paths` is not an array, neither
   * `path` nor a usable `paths` entry is given, or `analysis_mode` is not one
   * of the accepted values
   */
  public parseParams(raw: unknown): AnalyzeMediaToolParams {
    const obj = assertObject(raw, "arguments");

    const pathRaw = typeof obj["path"] === "string" ? obj["path"].trim() : "";
    const path = pathRaw ? pathRaw : undefined;
    if (path && !isAbsolute(path)) {
      throw new Error("path must be an absolute file path.");
    }

    const pathsRaw = obj["paths"];
    let paths: string[] | undefined;
    if (typeof pathsRaw !== "undefined") {
      if (!Array.isArray(pathsRaw)) {
        throw new Error("paths must be an array of absolute file paths.");
      }
      const out: string[] = [];
      for (const item of pathsRaw) {
        // Non-string and blank entries are skipped, not treated as errors.
        if (typeof item !== "string" || !item.trim()) {
          continue;
        }
        const p = item.trim();
        if (!isAbsolute(p)) {
          throw new Error("paths must contain only absolute file paths.");
        }
        out.push(p);
      }
      if (out.length > 0) {
        paths = out;
      }
    }

    if (!path && (!paths || paths.length === 0)) {
      throw new Error("Provide either 'path' or 'paths'.");
    }

    const context = optionalString(obj["context"]);
    const question = optionalString(obj["question"]);
    const language = optionalString(obj["language"]);
    const maxFrames = optionalInt(obj["max_frames"]);
    const transcribe = optionalBoolean(obj["transcribe"]);
    const transcriptionLanguage = optionalString(obj["transcription_language"]);

    const analysisModeRaw = optionalString(obj["analysis_mode"]);
    let analysisMode: AnalyzeMediaToolParams["analysisMode"];
    if (analysisModeRaw === "auto" || analysisModeRaw === "single" || analysisModeRaw === "multipass") {
      analysisMode = analysisModeRaw;
    } else if (analysisModeRaw) {
      throw new Error("analysis_mode must be one of: auto|single|multipass.");
    }

    const videoRaw = obj["video"];
    let video: AnalyzeMediaToolParams["video"] | undefined;
    if (typeof videoRaw !== "undefined") {
      const v = assertObject(videoRaw, "video");
      video = {
        clipStartSeconds: optionalNumber(v["clip_start_seconds"]),
        clipDurationSeconds: optionalNumber(v["clip_duration_seconds"]),
        segmentSeconds: optionalNumber(v["segment_seconds"]),
        maxSegments: optionalInt(v["max_segments"]),
        maxFramesPerSegment: optionalInt(v["max_frames_per_segment"])
      };
    }

    const documentRaw = obj["document"];
    let document: AnalyzeMediaToolParams["document"] | undefined;
    if (typeof documentRaw !== "undefined") {
      const d = assertObject(documentRaw, "document");
      document = {
        maxPagesTotal: optionalInt(d["max_pages_total"]),
        pagesPerBatch: optionalInt(d["pages_per_batch"]),
        maxImagesPerBatch: optionalInt(d["max_images_per_batch"]),
        scannedTextThresholdChars: optionalInt(d["scanned_text_threshold_chars"])
      };
    }

    const audioRaw = obj["audio"];
    let audio: AnalyzeMediaToolParams["audio"] | undefined;
    if (typeof audioRaw !== "undefined") {
      const a = assertObject(audioRaw, "audio");
      audio = {
        timestamps: optionalBoolean(a["timestamps"]),
        segmentSeconds: optionalNumber(a["segment_seconds"]),
        maxSegments: optionalInt(a["max_segments"])
      };
    }

    const imagesRaw = obj["images"];
    let images: AnalyzeMediaToolParams["images"] | undefined;
    if (typeof imagesRaw !== "undefined") {
      const img = assertObject(imagesRaw, "images");
      images = {
        maxImagesTotal: optionalInt(img["max_images_total"]),
        imagesPerBatch: optionalInt(img["images_per_batch"]),
        maxDimension: optionalInt(img["max_dimension"])
      };
    }

    return {
      path,
      paths,
      context,
      question,
      language,
      maxFrames,
      transcribe,
      transcriptionLanguage,
      analysisMode,
      video,
      document,
      audio,
      images
    };
  }

  /**
   * Executes the tool.
   *
   * @remarks
   * A non-empty `paths` takes precedence over `path`. More than one resolved
   * path is uploaded as a single media-set tar; exactly one resolved path is
   * uploaded as a plain resumable file upload. The response language falls
   * back to the `ENRIVISION_DEFAULT_LANGUAGE` environment variable when the
   * caller did not provide one.
   *
   * @param params - Validated parameters
   * @returns Tool result
   * @throws Error when no path is provided or the upload ends short of the
   * expected size
   */
  public async execute(params: AnalyzeMediaToolParams): Promise<AnalyzeMediaToolResult> {
    const serverUrl = assertHttpUrl(this.deps.defaultServerUrl, "ENRIPROXY_URL");
    const apiKey = assertNonEmptyString(this.deps.defaultApiKey, "ENRIPROXY_API_KEY");
    const timeoutMs = this.deps.defaultTimeoutMs;
    const clientTraceId = `enrivision_${randomUUID()}`;

    const client = this.deps.createClient(serverUrl, apiKey, timeoutMs);

    const resolvedPaths =
      Array.isArray(params.paths) && params.paths.length > 0
        ? [...params.paths]
        : typeof params.path === "string" && params.path.trim()
          ? [params.path.trim()]
          : [];

    if (resolvedPaths.length === 0) {
      throw new Error("Provide either 'path' or 'paths'.");
    }

    let uploadId: string;
    if (resolvedPaths.length > 1) {
      uploadId = await this.uploadImageSetAsMediaSetTar(client, resolvedPaths, timeoutMs, clientTraceId);
    } else {
      const singlePath = resolvedPaths[0]!;
      const fileSize = await this.assertReadableFile(singlePath);
      const filename = basename(singlePath);
      const contentType = this.detectMimeType(singlePath);

      const session = await client.createUploadSession({
        filename,
        sizeBytes: fileSize,
        contentType,
        clientTraceId
      });

      const finalOffset = await this.uploadFileResumable(client, singlePath, fileSize, session, timeoutMs);
      if (finalOffset !== fileSize) {
        throw new Error(`Upload incomplete: sent ${finalOffset} of ${fileSize} bytes.`);
      }

      uploadId = session.upload_id;
    }

    const defaultLanguageRaw =
      typeof process.env["ENRIVISION_DEFAULT_LANGUAGE"] === "string"
        ? process.env["ENRIVISION_DEFAULT_LANGUAGE"].trim()
        : "";
    const language =
      typeof params.language === "string" && params.language.trim()
        ? params.language.trim()
        : defaultLanguageRaw
          ? defaultLanguageRaw
          : undefined;

    const analysis = await client.analyze({
      uploadId,
      context: params.context,
      question: params.question,
      language,
      maxFrames: params.maxFrames,
      transcribe: params.transcribe,
      transcriptionLanguage: params.transcriptionLanguage,
      analysisMode: params.analysisMode,
      video: params.video,
      document: params.document,
      audio: params.audio,
      images: params.images
    });

    const extraction = this.stripInternalExtractionFields(analysis.extraction);

    return {
      analysis: analysis.analysis,
      media_type: analysis.media_type,
      extraction
    };
  }

  /**
   * Removes internal identifiers (upload ids, routing-only values, etc.) from the
   * extraction payload before returning it to the model.
   *
   * @param extraction - Raw extraction object returned by EnriProxy
   * @returns Sanitized extraction object
   */
  private stripInternalExtractionFields(extraction: Record<string, unknown>): Record<string, unknown> {
    const stripped = this.stripInternalFields(extraction);
    return this.asPlainObject(stripped);
  }

  /**
   * Recursively strips internal fields from an unknown value.
   *
   * @remarks
   * Arrays are mapped element by element, objects are rebuilt without the keys
   * in {@link INTERNAL_EXTRACTION_KEYS}, and every other value (primitives,
   * `null`, `undefined`) is returned unchanged.
   *
   * @param value - Unknown value to sanitize
   * @returns Sanitized value
   */
  private stripInternalFields(value: unknown): unknown {
    if (Array.isArray(value)) {
      return value.map((item) => this.stripInternalFields(item));
    }

    if (!value || typeof value !== "object") {
      return value;
    }

    const record = value as Record<string, unknown>;
    const next: Record<string, unknown> = {};

    for (const [key, child] of Object.entries(record)) {
      if (INTERNAL_EXTRACTION_KEYS.has(key)) {
        continue;
      }
      next[key] = this.stripInternalFields(child);
    }

    return next;
  }

  /**
   * Ensures the sanitized extraction value is a plain JSON object.
   *
   * @param value - Sanitized value
   * @returns The value itself when it is a non-array object, otherwise `{}`
   */
  private asPlainObject(value: unknown): Record<string, unknown> {
    if (value && typeof value === "object" && !Array.isArray(value)) {
      return value as Record<string, unknown>;
    }
    return {};
  }

  /**
   * Validates that a path exists and is a readable file.
   *
   * @param filePath - Local filesystem path
   * @returns File size in bytes
   * @throws Error when the path does not exist or is not a regular file; errors
   * from `open` propagate when the file cannot be opened for reading
   */
  private async assertReadableFile(filePath: string): Promise<number> {
    if (!existsSync(filePath)) {
      throw new Error(`File not found: ${filePath}`);
    }

    const st = await stat(filePath);
    if (!st.isFile()) {
      throw new Error(`Not a file: ${filePath}`);
    }

    // Ensure the file is readable: opening it is the check, the handle itself
    // is not used and is closed again right away.
    const handle = await open(filePath, "r");
    try {
      return st.size;
    } finally {
      await handle.close();
    }
  }

  /**
   * Detects MIME type using file extension.
   *
   * @param filePath - File path
   * @returns MIME type string, or `application/octet-stream` when the lookup
   * yields nothing usable
   */
  private detectMimeType(filePath: string): string {
    const detected = mimeLookup(filePath);
    if (typeof detected === "string" && detected.trim()) {
      return detected.trim();
    }
    return "application/octet-stream";
  }

  /**
   * Uploads multiple local images as a single EnriVision media-set tar archive.
   *
   * @remarks
   * This avoids creating many concurrent resumable upload sessions (which are
   * capped per API key) and enables server-side batching + reduce for large
   * screenshot sets.
   *
   * The archive holds a `manifest.json` entry followed by one entry per image,
   * named by its 1-based, zero-padded position plus the original extension.
   *
   * @param client - EnriProxy client
   * @param filePaths - Absolute image file paths
   * @param timeoutMs - Request timeout per HTTP request
   * @param clientTraceId - Client trace id for correlation
   * @returns Upload id for the created tar session
   * @throws Error when fewer than two paths are given, a file is not an image,
   * the server reports an out-of-range offset, or the upload stalls or ends
   * short of the archive size
   */
  private async uploadImageSetAsMediaSetTar(
    client: EnriProxyClient,
    filePaths: ReadonlyArray<string>,
    timeoutMs: number,
    clientTraceId: string
  ): Promise<string> {
    if (filePaths.length < 2) {
      throw new Error("uploadImageSetAsMediaSetTar requires at least 2 file paths.");
    }

    const files: Array<{
      index: number;
      path: string;
      filename: string;
      sizeBytes: number;
      contentType: string;
      entryName: string;
    }> = [];

    for (let i = 0; i < filePaths.length; i += 1) {
      const p = filePaths[i]!;
      const sizeBytes = await this.assertReadableFile(p);
      const filename = basename(p);
      const contentType = this.detectMimeType(p);
      if (!contentType.toLowerCase().startsWith("image/")) {
        throw new Error(`paths must contain only image files. Not an image: ${p} (${contentType})`);
      }

      // Only plain alphanumeric extensions are carried into the archive entry
      // name; anything else is replaced by a neutral ".img".
      const extRaw = extname(filename).toLowerCase();
      const ext = extRaw && /^[a-z0-9.]+$/.test(extRaw) ? extRaw : ".img";
      const entryName = `${String(i + 1).padStart(6, "0")}${ext}`;

      files.push({
        index: i + 1,
        path: p,
        filename,
        sizeBytes,
        contentType,
        entryName
      });
    }

    const manifest = {
      type: "enrivision_media_set",
      version: 1,
      media_type: "image_set",
      items: files.map((f) => ({
        index: f.index,
        name: f.entryName,
        filename: f.filename,
        content_type: f.contentType,
        size_bytes: f.sizeBytes
      }))
    };

    const manifestBuffer = Buffer.from(JSON.stringify(manifest), "utf8");
    const nowSeconds = Math.floor(Date.now() / 1000);

    const entries: TarEntry[] = [
      {
        name: ENRIVISION_MEDIA_SET_TAR_MANIFEST_NAME,
        source: { type: "buffer", buffer: manifestBuffer },
        mtimeSeconds: nowSeconds
      },
      ...files.map(
        (f): TarEntry => ({
          name: f.entryName,
          source: { type: "file", path: f.path, sizeBytes: f.sizeBytes },
          mtimeSeconds: nowSeconds
        })
      )
    ];

    const tarSizeBytes = computeTarSizeBytes(entries);
    const tar = new TarStream(entries);
    // The session is created with a fixed size, so both size computations must agree.
    if (tar.getSizeBytes() !== tarSizeBytes) {
      throw new Error("Internal error: tar size mismatch.");
    }

    const session = await client.createUploadSession({
      filename: "enrivision-image-set.tar",
      sizeBytes: tarSizeBytes,
      contentType: ENRIVISION_MEDIA_SET_TAR_CONTENT_TYPE,
      clientTraceId
    });

    const chunkSize = Math.max(1, Math.floor(session.chunk_size_bytes));

    let offset = await client.getUploadOffset(session.upload_id);
    if (offset < 0 || offset > tarSizeBytes) {
      throw new Error(`Invalid server offset for tar upload: ${offset}`);
    }

    while (offset < tarSizeBytes) {
      // Set as soon as any chunk has been sent in this pass.
      let madeProgress = false;

      for await (const chunk of tar.iterateChunks(offset, chunkSize)) {
        if (chunk.length === 0) {
          continue;
        }

        const expectedOffset = offset;
        const nextOffset = await this.uploadChunkWithRetry(
          client,
          session.upload_id,
          expectedOffset,
          chunk,
          timeoutMs
        );

        offset = nextOffset;
        madeProgress = true;

        const progress = Math.floor((offset / tarSizeBytes) * 100);
        console.error(`enrivision: upload ${progress}% (${offset}/${tarSizeBytes} bytes)`);

        // Offset resync: restart generation from the server-provided offset.
        if (offset !== expectedOffset + chunk.length) {
          break;
        }

        if (offset >= tarSizeBytes) {
          break;
        }
      }

      if (!madeProgress) {
        throw new Error("Upload stalled: no progress while sending tar chunks.");
      }
    }

    if (offset !== tarSizeBytes) {
      throw new Error(`Upload incomplete: sent ${offset} of ${tarSizeBytes} bytes.`);
    }

    return session.upload_id;
  }

  /**
   * Uploads a file to EnriProxy in resumable chunks.
   *
   * @remarks
   * Uploading starts at the offset reported by the server. After every chunk the
   * offset returned by {@link uploadChunkWithRetry} decides where the next read
   * happens, so a server-side resync is followed automatically. The loop stops
   * early when the file yields no more bytes; the caller compares the returned
   * offset with the expected size.
   *
   * @param client - EnriProxy client
   * @param filePath - Local file path
   * @param fileSize - Total file size in bytes
   * @param session - Server-created session
   * @param timeoutMs - Request timeout in milliseconds
   * @returns Final offset
   * @throws Error when the server reports a starting offset outside
   * `[0, fileSize]`
   */
  private async uploadFileResumable(
    client: EnriProxyClient,
    filePath: string,
    fileSize: number,
    session: CreateUploadSessionResponse,
    timeoutMs: number
  ): Promise<number> {
    const chunkSize = Math.max(1, Math.floor(session.chunk_size_bytes));

    const handle = await open(filePath, "r");
    try {
      let offset = await client.getUploadOffset(session.upload_id);
      // Same sanity check as the tar upload path: an out-of-range offset would
      // otherwise be passed straight to `handle.read` as the read position.
      if (offset < 0 || offset > fileSize) {
        throw new Error(`Invalid server offset for upload: ${offset}`);
      }

      while (offset < fileSize) {
        const remaining = fileSize - offset;
        const nextSize = Math.min(chunkSize, remaining);
        const buffer = Buffer.allocUnsafe(nextSize);

        const read = await handle.read(buffer, 0, nextSize, offset);
        if (read.bytesRead <= 0) {
          break;
        }

        // Only send the bytes actually read; the buffer is uninitialized beyond them.
        const chunk = read.bytesRead === buffer.length ? buffer : buffer.subarray(0, read.bytesRead);
        offset = await this.uploadChunkWithRetry(client, session.upload_id, offset, chunk, timeoutMs);

        const progress = Math.floor((offset / fileSize) * 100);
        console.error(`enrivision: upload ${progress}% (${offset}/${fileSize} bytes)`);
      }

      return offset;
    } finally {
      await handle.close();
    }
  }

  /**
   * Uploads a single chunk with retry and offset resync.
   *
   * @remarks
   * Up to five attempts are made, waiting 1s, 2s, 4s, 8s between them (the
   * delay is capped at 10s). An HTTP 409 triggers an offset query; when the
   * server's offset differs from `offset`, that value is returned instead of
   * retrying. HTTP 400, 401, 403, 404, 410 and 413 are rethrown immediately.
   *
   * @param client - EnriProxy client
   * @param uploadId - Upload id
   * @param offset - Expected offset
   * @param chunk - Chunk bytes
   * @param timeoutMs - Timeout in ms
   * @returns New offset
   * @throws The last error when all attempts fail, or the original error for
   * non-retryable HTTP statuses
   */
  private async uploadChunkWithRetry(
    client: EnriProxyClient,
    uploadId: string,
    offset: number,
    chunk: Buffer,
    timeoutMs: number
  ): Promise<number> {
    const maxAttempts = 5;

    for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
      try {
        return await client.appendUploadChunk({
          uploadId,
          offset,
          chunk,
          timeoutMs
        });
      } catch (error) {
        const message = error instanceof Error ? error.message : String(error);

        if (error instanceof EnriProxyHttpError) {
          // Offset mismatch: resync and let the caller re-read at the correct offset.
          if (error.status === 409) {
            const actual = await client.getUploadOffset(uploadId);
            if (actual !== offset) {
              console.error(`enrivision: offset resync (${offset} -> ${actual})`);
              return actual;
            }
          }

          // Do not retry on client errors (except 409 which is handled above).
          if (
            error.status === 400 ||
            error.status === 401 ||
            error.status === 403 ||
            error.status === 404 ||
            error.status === 410 ||
            error.status === 413
          ) {
            throw error;
          }
        }

        if (attempt === maxAttempts) {
          throw error;
        }

        const backoffMs = Math.min(1000 * Math.pow(2, attempt - 1), 10000);
        console.error(`enrivision: retry ${attempt}/${maxAttempts} after ${backoffMs}ms (${message})`);
        await new Promise((resolve) => setTimeout(resolve, backoffMs));
      }
    }

    // Not reachable: the final attempt either returns or rethrows above.
    throw new Error("Upload failed after retries.");
  }
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Bedolla/EnriVision'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

AnalyzeMediaTool.ts•23.5 KiB