import { z } from "zod";
import { type InferSchema } from "xmcp";
import { YouTubeClient, withCache } from "../utils/youtube-client";
import { withToolUsageLogging } from "../utils/tool-usage-log";
/**
 * Input schema for the tool: a single required string field, `videoId`.
 * The `.describe()` text is attached to the field as its description, and the
 * handler's parameter type is derived from this object via `InferSchema`.
 */
export const schema = {
videoId: z.string().describe("The YouTube video ID (e.g., 'dQw4w9WgXcQ')"),
};
/**
 * Tool metadata: the tool's name, a human-readable description, and
 * annotation hints. `metadata.name` is also passed to `withToolUsageLogging`
 * by this module's default export.
 */
export const metadata = {
name: "get_youtube_transcript",
description: "Get the transcript for a YouTube video with timestamps for each segment",
// NOTE(review): these hints declare the tool as read-only, non-destructive and
// idempotent; how the host interprets them is not visible in this file.
annotations: {
title: "Get YouTube Transcript",
readOnlyHint: true,
destructiveHint: false,
idempotentHint: true,
},
};
/**
 * Formats a millisecond offset as `HH:MM:SS.mmm`.
 *
 * Hours are zero-padded to at least two digits and are not capped, so offsets
 * of 100 hours or more simply produce a wider hour field. Fractional
 * milliseconds are floored.
 *
 * @param ms Offset in milliseconds.
 * @returns The formatted timestamp, or `undefined` when `ms` is not a finite,
 *          non-negative number.
 */
function msToTimestamp(ms: number | undefined) {
  if (!(typeof ms === "number" && isFinite(ms) && ms >= 0)) {
    return undefined;
  }

  // Left-pad a numeric field with zeros to a fixed minimum width.
  const pad = (value: number, width: number) => value.toString().padStart(width, "0");

  const wholeSeconds = Math.floor(ms / 1000);
  const clock = [
    Math.floor(wholeSeconds / 3600), // hours
    Math.floor((wholeSeconds % 3600) / 60), // minutes within the hour
    wholeSeconds % 60, // seconds within the minute
  ]
    .map((part) => pad(part, 2))
    .join(":");

  const fraction = pad(Math.floor(ms % 1000), 3);
  return `${clock}.${fraction}`;
}
/**
 * Coerces a raw timing value to milliseconds.
 *
 * Finite numbers are passed through unchanged; strings are parsed as base-10
 * integers. Anything else (missing, empty, unparseable, non-finite) yields
 * `undefined`, so callers can distinguish "absent" from a legitimate `0`.
 */
function parseMillis(value: unknown): number | undefined {
  if (typeof value === "number") {
    return Number.isFinite(value) ? value : undefined;
  }
  if (typeof value === "string") {
    const parsed = parseInt(value, 10);
    return Number.isNaN(parsed) ? undefined : parsed;
  }
  return undefined;
}

/**
 * Fetches the transcript of a YouTube video and returns its segments with
 * timing information, serialized as pretty-printed JSON in a single text
 * content item.
 *
 * Each segment carries `text`, `startMs`/`endMs`, formatted `start`/`end`
 * timestamps and `startSeconds`/`endSeconds`. Timing fields that cannot be
 * determined are `undefined` and are therefore omitted from the JSON.
 *
 * The whole computation is passed to `withCache` under a key derived from
 * the video ID.
 *
 * @param videoId The YouTube video ID.
 * @returns An object whose `content` array holds one `text` item with the
 *          JSON-encoded segment list.
 * @throws Error if the transcript body has no `initial_segments` array.
 */
async function get_youtube_transcript({ videoId }: InferSchema<typeof schema>) {
  const cacheKey = `transcript_${videoId}_with_timestamps`;
  return withCache(cacheKey, async () => {
    const info = await YouTubeClient.withRetry((client) => client.getInfo(videoId));
    const transcriptInfo = await info.getTranscript();
    const body = transcriptInfo?.transcript?.content?.body;
    if (!body || !Array.isArray(body.initial_segments)) {
      throw new Error("No transcript available for this video");
    }

    // Segments are read loosely because several field spellings are tolerated
    // (snake_case and camelCase, string or numeric values).
    const rawSegments: any[] = body.initial_segments;

    const mapped = rawSegments.map((segment, idx) => {
      // Timing values usually arrive as strings; prefer snake_case, then camelCase.
      const startMs = parseMillis(segment?.start_ms) ?? parseMillis(segment?.startMs);
      let endMs = parseMillis(segment?.end_ms) ?? parseMillis(segment?.endMs);

      // All checks below compare against `undefined` rather than relying on
      // truthiness: a segment starting at 0 ms is valid and must keep its
      // timestamps and end-time fallbacks.

      // Fallback 1: derive the end from an explicit duration.
      if (startMs !== undefined && endMs === undefined) {
        const durationMs = parseMillis(segment?.duration_ms);
        if (durationMs !== undefined) {
          endMs = startMs + durationMs;
        }
      }

      // Fallback 2: use the next segment's start as this segment's end.
      if (startMs !== undefined && endMs === undefined && idx < rawSegments.length - 1) {
        const next = rawSegments[idx + 1];
        endMs = parseMillis(next?.start_ms) ?? parseMillis(next?.startMs);
      }

      const text = segment?.snippet?.text ?? segment?.text ?? "";

      return {
        text,
        startMs,
        endMs,
        // msToTimestamp itself returns undefined for missing or invalid input.
        start: msToTimestamp(startMs),
        end: msToTimestamp(endMs),
        // Seconds are provided alongside milliseconds for convenience.
        startSeconds: startMs !== undefined ? startMs / 1000 : undefined,
        endSeconds: endMs !== undefined ? endMs / 1000 : undefined,
      };
    });

    // Always return timestamped segments.
    return {
      content: [
        {
          type: "text",
          text: JSON.stringify(mapped, null, 2),
        },
      ],
    };
  });
}
// Default export: the transcript handler wrapped by withToolUsageLogging and
// labelled with the tool name from `metadata`.
// NOTE(review): the wrapper's behavior lives in ../utils/tool-usage-log and is
// not visible here.
export default withToolUsageLogging(get_youtube_transcript, metadata.name);