YouTube Info MCP Server

scraper.ts•13.4 KiB

import {
  VideoMetadata,
  Transcript,
  TranscriptSegment,
  YouTubePlayerResponse,
  YouTubeError,
  YouTubeErrorCode,
  CaptionTrack,
} from './types.js';
import { VideoIdSchema, YouTubePlayerResponseSchema, TranscriptResponseSchema } from './validation.js';
import { withRetry, isRetryableError } from './utils.js';
import { videoInfoCache, transcriptCache } from './cache.js';
import { logger } from './logger.js';
import { z } from 'zod';

const USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

// InnerTube API configuration
const INNERTUBE_API_URL = 'https://www.youtube.com/youtubei/v1/player';
const INNERTUBE_CONTEXT = {
  client: {
    clientName: 'ANDROID',
    clientVersion: '20.10.38',
  },
};

// Caption formats requested from the caption track URL, in order of preference.
const TRANSCRIPT_FORMATS = ['json3', 'srv3', 'vtt'];

// Matches a WebVTT timestamp; the hours component is optional ("MM:SS.mmm").
const VTT_TIMESTAMP = /((?:\d{2,}:)?\d{2}:\d{2}\.\d{3})/;

/**
 * Builds an `Error` carrying the HTTP status code on a `status` property.
 *
 * NOTE(review): presumably `withRetry` / `isRetryableError` inspect `status`
 * to decide whether to retry — confirm against ./utils.js.
 */
function createHttpError(status: number): Error & { status: number } {
  return Object.assign(new Error(`HTTP error ${status}`), { status });
}

/**
 * Fetches metadata and the list of caption tracks for a YouTube video.
 *
 * The watch page is downloaded first; if an InnerTube API key can be found in
 * it, the InnerTube player endpoint is queried, otherwise (or if that fails)
 * the player response embedded in the page HTML is used. Successful results
 * are stored in `videoInfoCache` keyed by video ID.
 *
 * @param videoId - The YouTube video ID; validated with `VideoIdSchema`.
 * @returns The extracted metadata and caption tracks.
 * @throws YouTubeError `INVALID_ID` when the ID fails validation, `NOT_FOUND`
 *   on HTTP 404, `PARSE_ERROR` when no player response can be extracted,
 *   `AGE_RESTRICTED` / `REGION_BLOCKED` / `PRIVATE` when the video is not
 *   playable, and `NETWORK_ERROR` for any other failure.
 */
export async function fetchVideoInfo(
  videoId: string
): Promise<{ metadata: VideoMetadata; captionTracks: CaptionTrack[] }> {
  // Validate video ID format with Zod
  try {
    VideoIdSchema.parse(videoId);
  } catch (error) {
    if (error instanceof z.ZodError) {
      throw new YouTubeError(error.errors[0]?.message ?? 'Invalid YouTube video ID format', 'INVALID_ID');
    }
    throw new YouTubeError('Invalid YouTube video ID format', 'INVALID_ID');
  }

  // Check cache first
  const cached = videoInfoCache.get(videoId);
  if (cached) {
    logger.debug(`Cache hit for video ${videoId}`);
    return cached;
  }

  logger.info(`Fetching video info for ${videoId}`);

  const url = `https://www.youtube.com/watch?v=${videoId}`;

  try {
    // Fetch with retry logic
    const html = await withRetry(
      async () => {
        const response = await fetch(url, {
          headers: {
            'User-Agent': USER_AGENT,
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          },
        });

        if (!response.ok) {
          if (response.status === 404) {
            throw new YouTubeError('Video not found', 'NOT_FOUND');
          }
          throw createHttpError(response.status);
        }

        return response.text();
      },
      { maxRetries: 3, baseDelay: 1000 }
    );

    // First try to extract InnerTube API key and use InnerTube API
    const apiKey = extractInnerTubeApiKey(html);
    let playerResponse: YouTubePlayerResponse | null = null;

    if (apiKey) {
      logger.debug('Attempting to fetch via InnerTube API');
      playerResponse = await fetchViaInnerTube(videoId, apiKey);
    }

    // Fall back to extracting from HTML if InnerTube didn't work
    if (!playerResponse) {
      logger.debug('Falling back to HTML extraction');
      playerResponse = extractPlayerResponse(html);
    }

    if (!playerResponse) {
      throw new YouTubeError('Could not extract video data from page', 'PARSE_ERROR');
    }

    // Check if video is available
    if (playerResponse.playabilityStatus?.status !== 'OK') {
      const reason = playerResponse.playabilityStatus?.reason || 'Video is not available';
      const lowerReason = reason.toLowerCase();

      // Determine specific error type; anything unrecognised is reported as PRIVATE
      let errorCode: YouTubeErrorCode = 'PRIVATE';
      if (lowerReason.includes('age')) {
        errorCode = 'AGE_RESTRICTED';
      } else if (lowerReason.includes('region') || lowerReason.includes('country')) {
        errorCode = 'REGION_BLOCKED';
      }

      throw new YouTubeError(reason, errorCode);
    }

    const metadata = extractMetadata(playerResponse);
    const captionTracks = extractCaptionTracks(playerResponse);
    const result = { metadata, captionTracks };

    // Cache the result
    videoInfoCache.set(videoId, result);
    logger.debug(`Cached video info for ${videoId}`);

    return result;
  } catch (error) {
    logger.error(`Failed to fetch video ${videoId}:`, error);

    if (error instanceof YouTubeError) {
      throw error;
    }

    throw new YouTubeError(
      `Failed to fetch video: ${error instanceof Error ? error.message : 'Unknown error'}`,
      'NETWORK_ERROR',
      error
    );
  }
}

/**
 * Downloads and parses the transcript for a caption track.
 *
 * Each format in `TRANSCRIPT_FORMATS` is requested in turn (by setting the
 * `fmt` query parameter on the track's `baseUrl`); the first one that
 * downloads and parses successfully is cached under `baseUrl` and returned.
 *
 * @param captionTrack - The caption track whose `baseUrl` is fetched.
 * @returns The parsed transcript.
 * @throws YouTubeError `NO_TRANSCRIPT` when no format yields a usable
 *   transcript or when an unexpected error occurs.
 */
export async function fetchTranscript(captionTrack: CaptionTrack): Promise<Transcript> {
  // Create cache key from caption track URL
  const cacheKey = captionTrack.baseUrl;

  // Check cache first
  const cached = transcriptCache.get(cacheKey);
  if (cached) {
    logger.debug(`Cache hit for transcript`);
    return cached;
  }

  logger.info(`Fetching transcript for language: ${captionTrack.languageCode || 'unknown'}`);

  try {
    for (const fmt of TRANSCRIPT_FORMATS) {
      const url = new URL(captionTrack.baseUrl);
      url.searchParams.set('fmt', fmt);

      // Try fetching with retry logic for each format
      let text: string;
      try {
        text = await withRetry(
          async () => {
            const resp = await fetch(url.toString(), {
              headers: {
                'User-Agent': USER_AGENT,
                'Accept': '*/*',
                'Accept-Language': 'en-US,en;q=0.9',
                'Referer': 'https://www.youtube.com/',
              },
            });

            if (!resp.ok) {
              throw createHttpError(resp.status);
            }

            return resp.text();
          },
          { maxRetries: 2, baseDelay: 500 }
        );
      } catch (error) {
        // Best effort: a failed download for one format is not fatal, try the next
        logger.debug(
          `Transcript download failed for format ${fmt}: ${error instanceof Error ? error.message : 'Unknown error'}`
        );
        continue;
      }

      // Check if we got actual content
      if (!text || text.length === 0) {
        continue;
      }

      try {
        // Try to parse based on format
        const transcript = fmt === 'vtt' ? parseVTTTranscript(text) : parseTranscript(text);

        // Cache the successful result
        transcriptCache.set(cacheKey, transcript);
        return transcript;
      } catch (error) {
        // Best effort: an unparseable body for one format is not fatal, try the next
        logger.debug(
          `Transcript parse failed for format ${fmt}: ${error instanceof Error ? error.message : 'Unknown error'}`
        );
        continue;
      }
    }

    throw new YouTubeError(
      'Transcript not available. The video may not have captions, or they may be restricted.',
      'NO_TRANSCRIPT'
    );
  } catch (error) {
    if (error instanceof YouTubeError) {
      throw error;
    }

    throw new YouTubeError('Failed to fetch transcript', 'NO_TRANSCRIPT', error);
  }
}

/**
 * Extracts the player response object embedded in a watch page's HTML.
 *
 * Several regex patterns are tried in order; the first capture that parses as
 * JSON and passes `YouTubePlayerResponseSchema` is returned.
 *
 * @param html - The watch page HTML.
 * @returns The validated player response, or `null` when none could be extracted.
 */
function extractPlayerResponse(html: string): YouTubePlayerResponse | null {
  // `escaped` marks captures that are a JSON string embedded inside another
  // JSON string and therefore need unescaping before being parsed.
  const patterns: ReadonlyArray<{ regex: RegExp; escaped: boolean }> = [
    { regex: /var\s+ytInitialPlayerResponse\s*=\s*({.+?});/s, escaped: false },
    { regex: /ytInitialPlayerResponse\s*=\s*({.+?});/s, escaped: false },
    { regex: /"playerResponse":"(\\?.+?)\\?"/s, escaped: true },
  ];

  for (const { regex, escaped } of patterns) {
    const captured = html.match(regex)?.[1];
    if (captured === undefined) {
      continue;
    }

    try {
      const jsonStr = escaped ? captured.replace(/\\"/g, '"').replace(/\\\\/g, '\\') : captured;
      const parsed: unknown = JSON.parse(jsonStr);

      // Validate the response with Zod
      return YouTubePlayerResponseSchema.parse(parsed);
    } catch (e) {
      // This pattern's capture was not a valid player response; try the next one
      continue;
    }
  }

  return null;
}

/**
 * Finds the InnerTube API key embedded in a watch page's HTML.
 *
 * @param html - The watch page HTML.
 * @returns The key, or `null` (after logging a warning) when no pattern matches.
 */
function extractInnerTubeApiKey(html: string): string | null {
  // Try multiple patterns to extract the API key
  const patterns = [
    /"INNERTUBE_API_KEY":"([^"]+)"/,
    /"innertubeApiKey":"([^"]+)"/,
    /['"]INNERTUBE_API_KEY['"]:\s*['"]([^'"]+)['"]/,
  ];

  for (const pattern of patterns) {
    const key = html.match(pattern)?.[1];
    if (key) {
      logger.debug(`Found InnerTube API key: ${key.substring(0, 10)}...`);
      return key;
    }
  }

  logger.warn('Could not extract InnerTube API key');
  return null;
}

/**
 * Requests the player response for a video from the InnerTube player endpoint.
 *
 * Every failure (non-2xx status, schema validation failure, network error) is
 * logged and reported as `null` so the caller can fall back to HTML extraction.
 *
 * @param videoId - The YouTube video ID.
 * @param apiKey - The InnerTube API key sent as the `key` query parameter.
 * @returns The validated player response, or `null` on any failure.
 */
async function fetchViaInnerTube(videoId: string, apiKey: string): Promise<YouTubePlayerResponse | null> {
  // The key comes from scraped HTML, so encode it before placing it in the query string
  const url = `${INNERTUBE_API_URL}?key=${encodeURIComponent(apiKey)}`;

  try {
    const response = await fetch(url, {
      method: 'POST',
      headers: {
        'User-Agent': USER_AGENT,
        'Content-Type': 'application/json',
        'Accept': 'application/json',
        'Accept-Language': 'en-US,en;q=0.9',
        'Origin': 'https://www.youtube.com',
        'Referer': `https://www.youtube.com/watch?v=${videoId}`,
      },
      body: JSON.stringify({
        context: INNERTUBE_CONTEXT,
        videoId: videoId,
      }),
    });

    if (!response.ok) {
      logger.warn(`InnerTube API returned status ${response.status}`);
      return null;
    }

    const data: unknown = await response.json();

    // Validate the response with Zod
    try {
      return YouTubePlayerResponseSchema.parse(data);
    } catch (e) {
      logger.warn('InnerTube response validation failed');
      return null;
    }
  } catch (error) {
    logger.error('InnerTube API request failed:', error);
    return null;
  }
}

/**
 * Parses a base-10 integer from an optional numeric string, returning 0 when
 * the value is missing, empty, or not numeric.
 */
function parseCount(value: string | null | undefined): number {
  const parsed = parseInt(value || '0', 10);
  return Number.isNaN(parsed) ? 0 : parsed;
}

/**
 * Builds `VideoMetadata` from a player response's `videoDetails`.
 *
 * Missing or empty fields fall back to placeholder values.
 *
 * @throws YouTubeError `PARSE_ERROR` when `videoDetails` is absent.
 */
function extractMetadata(playerResponse: YouTubePlayerResponse): VideoMetadata {
  const details = playerResponse.videoDetails;

  if (!details) {
    throw new YouTubeError('Video details not found', 'PARSE_ERROR');
  }

  return {
    title: details.title || 'Unknown Title',
    author: details.author || 'Unknown Author',
    lengthSeconds: parseCount(details.lengthSeconds),
    viewCount: parseCount(details.viewCount),
    description: details.shortDescription || '',
  };
}

/**
 * Returns the caption tracks listed in a player response that have a
 * `baseUrl`, or an empty array when there are none.
 */
function extractCaptionTracks(playerResponse: YouTubePlayerResponse): CaptionTrack[] {
  const tracks = playerResponse.captions?.playerCaptionsTracklistRenderer?.captionTracks;

  if (!tracks || tracks.length === 0) {
    return [];
  }

  return tracks.filter(track => track.baseUrl);
}

/**
 * Parses a transcript body that is either JSON or XML.
 *
 * JSON validated by `TranscriptResponseSchema` is read from its `events`
 * (json3 format) or, failing that, its `captions` list. If the body is not
 * JSON or fails validation, `<text start="...">...</text>` elements are
 * extracted instead.
 *
 * @param jsonText - The raw transcript body.
 * @returns The concatenated text plus the timed segments.
 * @throws YouTubeError `PARSE_ERROR` when no segments could be extracted.
 */
function parseTranscript(jsonText: string): Transcript {
  const segments: TranscriptSegment[] = [];
  let fullText = '';

  try {
    const data: unknown = JSON.parse(jsonText);

    // Validate and parse the transcript response
    const validatedData = TranscriptResponseSchema.parse(data);

    // Handle json3 format from YouTube
    if (validatedData.events) {
      for (const event of validatedData.events) {
        // Skip music events and other non-text events
        if (!event.segs || !Array.isArray(event.segs)) {
          continue;
        }

        let eventText = '';
        for (const seg of event.segs) {
          if (seg.utf8) {
            eventText += seg.utf8;
          }
        }

        if (eventText.trim()) {
          // tStartMs is divided by 1000 to express the start in seconds
          const startTime = (event.tStartMs || 0) / 1000;
          segments.push({ start: startTime, text: eventText.trim() });
          fullText += eventText + ' ';
        }
      }
    }

    // Handle other potential JSON formats
    if (segments.length === 0 && validatedData.captions) {
      for (const caption of validatedData.captions) {
        if (caption.text) {
          segments.push({ start: caption.start || 0, text: caption.text });
          fullText += caption.text + ' ';
        }
      }
    }
  } catch (e) {
    // If not JSON, try XML format
    const textRegex = /<text[^>]*start="([^"]*)"[^>]*>(.*?)<\/text>/gs;
    let match: RegExpExecArray | null;

    while ((match = textRegex.exec(jsonText)) !== null) {
      const start = parseFloat(match[1] ?? '');
      const text = decodeXML(match[2] ?? '');

      if (text.trim()) {
        // A missing or malformed start attribute must not leak NaN into the result
        segments.push({ start: Number.isFinite(start) ? start : 0, text: text.trim() });
        fullText += text + ' ';
      }
    }
  }

  if (segments.length === 0) {
    throw new YouTubeError('Could not parse transcript format', 'PARSE_ERROR');
  }

  return { text: fullText.trim(), segments };
}

/**
 * Parses a WebVTT transcript into timed segments.
 *
 * Blocks are separated by blank lines. Within a block the timing line is the
 * first line containing `-->`, so an optional cue identifier line before it is
 * tolerated; every line after the timing line is joined into the cue text.
 * Blocks without a timing line (the header, notes, styles) are skipped.
 *
 * @param vttText - The raw WebVTT body; LF, CRLF and CR line endings are accepted.
 * @returns The concatenated text plus the timed segments.
 * @throws YouTubeError `PARSE_ERROR` when no cues could be extracted.
 */
function parseVTTTranscript(vttText: string): Transcript {
  const segments: TranscriptSegment[] = [];
  let fullText = '';

  // Normalise line endings, then split on blank lines to get cues
  const cues = vttText.replace(/\r\n?/g, '\n').split(/\n\n+/);

  for (const cue of cues) {
    // Skip the WEBVTT header and empty cues
    if (!cue || cue.startsWith('WEBVTT') || cue.trim() === '') {
      continue;
    }

    const lines = cue.trim().split('\n');
    const timingIndex = lines.findIndex(line => line.includes('-->'));
    if (timingIndex === -1) {
      continue;
    }

    // Parse timestamp line (e.g., "00:00:00.000 --> 00:00:02.000"); only the start is used
    const startStamp = (lines[timingIndex] ?? '').match(VTT_TIMESTAMP)?.[1];
    if (startStamp === undefined) {
      continue;
    }

    // Join all lines after the timestamp
    const text = lines.slice(timingIndex + 1).join(' ').trim();
    if (text) {
      segments.push({ start: parseVTTTime(startStamp), text });
      fullText += text + ' ';
    }
  }

  if (segments.length === 0) {
    throw new YouTubeError('Could not parse VTT transcript format', 'PARSE_ERROR');
  }

  return { text: fullText.trim(), segments };
}

/**
 * Converts a WebVTT timestamp to seconds.
 *
 * @param timeStr - A timestamp of the form `HH:MM:SS.mmm` or `MM:SS.mmm`.
 * @returns The time in seconds, including the fractional part.
 */
function parseVTTTime(timeStr: string): number {
  // Read components from the right so the hours component may be omitted
  const parts = timeStr.split(':');
  const count = parts.length;

  const seconds = parseFloat(parts[count - 1] ?? '0');
  const minutes = count >= 2 ? parseInt(parts[count - 2] ?? '0', 10) : 0;
  const hours = count >= 3 ? parseInt(parts[count - 3] ?? '0', 10) : 0;

  return hours * 3600 + minutes * 60 + seconds;
}

/**
 * Converts a numeric character reference's code point to a string, returning
 * `fallback` when the value is not a valid Unicode code point.
 */
function fromCodePointOr(codePoint: number, fallback: string): string {
  const isValid = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff;
  return isValid ? String.fromCodePoint(codePoint) : fallback;
}

/**
 * Decodes XML/HTML character entities in caption text.
 *
 * Handles the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, the
 * common references `&#39;` and `&#x2F;`, and any other decimal or
 * hexadecimal numeric character reference.
 *
 * NOTE: `&amp;` is replaced first, so double-escaped input such as
 * `&amp;#39;` is decoded all the way to `'`. This ordering is deliberate and
 * must be kept if the replacements are ever rearranged.
 *
 * @param text - The entity-encoded text.
 * @returns The decoded text.
 */
function decodeXML(text: string): string {
  return text
    .replace(/&amp;/g, '&')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&apos;/g, "'")
    .replace(/&#x2F;/g, '/')
    .replace(/&#(\d+);/g, (entity: string, dec: string) => fromCodePointOr(Number(dec), entity))
    .replace(/&#x([0-9a-fA-F]+);/g, (entity: string, hex: string) => fromCodePointOr(parseInt(hex, 16), entity));
}

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Limecooler/yt-video-info'

If you have feedback or need assistance with the MCP directory API, please join our Discord server.

scraper.ts•13.4 KiB