Skip to main content
Glama
scraper.ts13.7 kB
import { VideoMetadata, Transcript, TranscriptSegment, YouTubePlayerResponse, YouTubeError, YouTubeErrorCode, CaptionTrack } from './types.js'; import { VideoIdSchema, YouTubePlayerResponseSchema, TranscriptResponseSchema } from './validation.js'; import { withRetry, isRetryableError } from './utils.js'; import { videoInfoCache, transcriptCache } from './cache.js'; import { logger } from './logger.js'; import { z } from 'zod'; const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'; // InnerTube API configuration const INNERTUBE_API_URL = 'https://www.youtube.com/youtubei/v1/player'; const INNERTUBE_CONTEXT = { client: { clientName: 'ANDROID', clientVersion: '20.10.38' } }; export async function fetchVideoInfo(videoId: string): Promise<{ metadata: VideoMetadata; captionTracks: CaptionTrack[] }> { // Validate video ID format with Zod try { VideoIdSchema.parse(videoId); } catch (error) { if (error instanceof z.ZodError) { throw new YouTubeError(error.errors[0].message, 'INVALID_ID'); } throw new YouTubeError('Invalid YouTube video ID format', 'INVALID_ID'); } // Check cache first const cached = videoInfoCache.get(videoId); if (cached) { logger.debug(`Cache hit for video ${videoId}`); return cached; } logger.info(`Fetching video info for ${videoId}`); const url = `https://www.youtube.com/watch?v=${videoId}`; try { // Fetch with retry logic const { html, response } = await withRetry(async () => { const response = await fetch(url, { headers: { 'User-Agent': USER_AGENT, 'Accept-Language': 'en-US,en;q=0.9', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', } }); if (!response.ok) { if (response.status === 404) { throw new YouTubeError('Video not found', 'NOT_FOUND'); } const error = new Error(`HTTP error ${response.status}`) as any; error.status = response.status; throw error; } const html = await response.text(); return { html, response }; }, { maxRetries: 3, baseDelay: 1000 }); // First try to extract InnerTube API key and use InnerTube API const apiKey = extractInnerTubeApiKey(html); let playerResponse: YouTubePlayerResponse | null = null; if (apiKey) { logger.debug('Attempting to fetch via InnerTube API'); playerResponse = await fetchViaInnerTube(videoId, apiKey); } // Fall back to extracting from HTML if InnerTube didn't work if (!playerResponse) { logger.debug('Falling back to HTML extraction'); playerResponse = extractPlayerResponse(html); } if (!playerResponse) { throw new YouTubeError('Could not extract video data from page', 'PARSE_ERROR'); } // Check if video is available if (playerResponse.playabilityStatus?.status !== 'OK') { const status = playerResponse.playabilityStatus?.status; const reason = playerResponse.playabilityStatus?.reason || 'Video is not available'; // Determine specific error type let errorCode: YouTubeErrorCode = 'PRIVATE'; if (reason.toLowerCase().includes('age')) { errorCode = 'AGE_RESTRICTED'; } else if (reason.toLowerCase().includes('region') || reason.toLowerCase().includes('country')) { errorCode = 'REGION_BLOCKED'; } throw new YouTubeError(reason, errorCode); } const metadata = extractMetadata(playerResponse); const captionTracks = extractCaptionTracks(playerResponse); const result = { metadata, captionTracks }; // Cache the result videoInfoCache.set(videoId, result); logger.debug(`Cached video info for ${videoId}`); return result; } catch (error) { logger.error(`Failed to fetch video ${videoId}:`, error); if (error instanceof YouTubeError) { throw error; } throw new YouTubeError( `Failed to fetch video: ${error instanceof Error ? error.message : 'Unknown error'}`, 'NETWORK_ERROR', error ); } } export async function fetchTranscript(captionTrack: CaptionTrack): Promise<Transcript> { // Create cache key from caption track URL const cacheKey = captionTrack.baseUrl; // Check cache first const cached = transcriptCache.get(cacheKey); if (cached) { logger.debug(`Cache hit for transcript`); return cached; } logger.info(`Fetching transcript for language: ${captionTrack.languageCode || 'unknown'}`); try { // Try multiple format options const formats = ['json3', 'srv3', 'vtt']; for (const fmt of formats) { const url = new URL(captionTrack.baseUrl); url.searchParams.set('fmt', fmt); // Try fetching with retry logic for each format let response: Response; let text: string; try { const result = await withRetry(async () => { const resp = await fetch(url.toString(), { headers: { 'User-Agent': USER_AGENT, 'Accept': '*/*', 'Accept-Language': 'en-US,en;q=0.9', 'Referer': 'https://www.youtube.com/', } }); if (!resp.ok) { const error = new Error(`HTTP error ${resp.status}`) as any; error.status = resp.status; throw error; } const content = await resp.text(); return { response: resp, text: content }; }, { maxRetries: 2, baseDelay: 500 }); response = result.response; text = result.text; } catch (error) { // Skip to next format on error continue; } // Check if we got actual content if (!text || text.length === 0) { continue; } try { // Try to parse based on format let transcript: Transcript; if (fmt === 'vtt') { transcript = parseVTTTranscript(text); } else { transcript = parseTranscript(text); } // Cache the successful result transcriptCache.set(cacheKey, transcript); return transcript; } catch (e) { // Try next format continue; } } throw new YouTubeError( 'Transcript not available. The video may not have captions, or they may be restricted.', 'NO_TRANSCRIPT' ); } catch (error) { if (error instanceof YouTubeError) { throw error; } throw new YouTubeError( 'Failed to fetch transcript', 'NO_TRANSCRIPT', error ); } } function extractPlayerResponse(html: string): YouTubePlayerResponse | null { // Try multiple patterns to extract the player response const patterns = [ /var\s+ytInitialPlayerResponse\s*=\s*({.+?});/s, /ytInitialPlayerResponse\s*=\s*({.+?});/s, /"playerResponse":"(\\?.+?)\\?"/s ]; for (const pattern of patterns) { const match = html.match(pattern); if (match) { try { let jsonStr = match[1]; // If it's from the third pattern, we need to unescape it if (pattern === patterns[2]) { jsonStr = jsonStr.replace(/\\"/g, '"').replace(/\\\\/g, '\\'); } const parsed = JSON.parse(jsonStr); // Validate the response with Zod return YouTubePlayerResponseSchema.parse(parsed); } catch (e) { continue; } } } return null; } function extractInnerTubeApiKey(html: string): string | null { // Try multiple patterns to extract the API key const patterns = [ /"INNERTUBE_API_KEY":"([^"]+)"/, /"innertubeApiKey":"([^"]+)"/, /['"]INNERTUBE_API_KEY['"]:\s*['"]([^'"]+)['"]/, ]; for (const pattern of patterns) { const match = html.match(pattern); if (match && match[1]) { logger.debug(`Found InnerTube API key: ${match[1].substring(0, 10)}...`); return match[1]; } } logger.warn('Could not extract InnerTube API key'); return null; } async function fetchViaInnerTube(videoId: string, apiKey: string): Promise<YouTubePlayerResponse | null> { const url = `${INNERTUBE_API_URL}?key=${apiKey}`; try { const response = await fetch(url, { method: 'POST', headers: { 'User-Agent': USER_AGENT, 'Content-Type': 'application/json', 'Accept': 'application/json', 'Accept-Language': 'en-US,en;q=0.9', 'Origin': 'https://www.youtube.com', 'Referer': `https://www.youtube.com/watch?v=${videoId}`, }, body: JSON.stringify({ context: INNERTUBE_CONTEXT, videoId: videoId, }), }); if (!response.ok) { logger.warn(`InnerTube API returned status ${response.status}`); return null; } const data = await response.json(); // Validate the response with Zod try { return YouTubePlayerResponseSchema.parse(data); } catch (e) { logger.warn('InnerTube response validation failed'); return null; } } catch (error) { logger.error('InnerTube API request failed:', error); return null; } } function extractMetadata(playerResponse: YouTubePlayerResponse): VideoMetadata { const details = playerResponse.videoDetails; if (!details) { throw new YouTubeError('Video details not found', 'PARSE_ERROR'); } return { title: details.title || 'Unknown Title', author: details.author || 'Unknown Author', lengthSeconds: parseInt(details.lengthSeconds || '0', 10), viewCount: parseInt(details.viewCount || '0', 10), description: details.shortDescription || '', }; } function extractCaptionTracks(playerResponse: YouTubePlayerResponse): CaptionTrack[] { const tracks = playerResponse.captions?.playerCaptionsTracklistRenderer?.captionTracks; if (!tracks || tracks.length === 0) { return []; } return tracks.filter(track => track.baseUrl); } function parseTranscript(jsonText: string): Transcript { const segments: TranscriptSegment[] = []; let fullText = ''; try { const data = JSON.parse(jsonText); // Validate and parse the transcript response const validatedData = TranscriptResponseSchema.parse(data); // Handle json3 format from YouTube if (validatedData.events) { for (const event of validatedData.events) { // Skip music events and other non-text events if (!event.segs || !Array.isArray(event.segs)) { continue; } let eventText = ''; for (const seg of event.segs) { if (seg.utf8) { eventText += seg.utf8; } } if (eventText.trim()) { const startTime = (event.tStartMs || 0) / 1000; segments.push({ start: startTime, text: eventText.trim() }); fullText += eventText + ' '; } } } // Handle other potential JSON formats if (segments.length === 0 && validatedData.captions) { for (const caption of validatedData.captions) { if (caption.text) { segments.push({ start: caption.start || 0, text: caption.text }); fullText += caption.text + ' '; } } } } catch (e) { // If not JSON, try XML format const textRegex = /<text[^>]*start="([^"]*)"[^>]*>(.*?)<\/text>/gs; let match; while ((match = textRegex.exec(jsonText)) !== null) { const start = parseFloat(match[1]); const text = decodeXML(match[2]); if (text.trim()) { segments.push({ start, text: text.trim() }); fullText += text + ' '; } } } if (segments.length === 0) { throw new YouTubeError('Could not parse transcript format', 'PARSE_ERROR'); } return { text: fullText.trim(), segments }; } function parseVTTTranscript(vttText: string): Transcript { const segments: TranscriptSegment[] = []; let fullText = ''; // Split by double newline to get cues const cues = vttText.split(/\n\n+/); for (const cue of cues) { // Skip the WEBVTT header and empty cues if (!cue || cue.startsWith('WEBVTT') || cue.trim() === '') { continue; } const lines = cue.trim().split('\n'); if (lines.length >= 2) { // Parse timestamp line (e.g., "00:00:00.000 --> 00:00:02.000") const timeMatch = lines[0].match(/(\d{2}:\d{2}:\d{2}\.\d{3})/); if (timeMatch) { const startTime = parseVTTTime(timeMatch[1]); // Join all lines after the timestamp const text = lines.slice(1).join(' ').trim(); if (text) { segments.push({ start: startTime, text }); fullText += text + ' '; } } } } if (segments.length === 0) { throw new YouTubeError('Could not parse VTT transcript format', 'PARSE_ERROR'); } return { text: fullText.trim(), segments }; } function parseVTTTime(timeStr: string): number { const parts = timeStr.split(':'); const hours = parseInt(parts[0], 10); const minutes = parseInt(parts[1], 10); const seconds = parseFloat(parts[2]); return hours * 3600 + minutes * 60 + seconds; } function decodeXML(text: string): string { return text .replace(/&amp;/g, '&') .replace(/&lt;/g, '<') .replace(/&gt;/g, '>') .replace(/&quot;/g, '"') .replace(/&#39;/g, "'") .replace(/&#x27;/g, "'") .replace(/&#x2F;/g, '/') .replace(/&#(\d+);/g, (match, dec) => String.fromCharCode(Number(dec))) .replace(/&#x([0-9a-fA-F]+);/g, (match, hex) => String.fromCharCode(parseInt(hex, 16))); }

Latest Blog Posts

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/Limecooler/yt-video-info'

If you have feedback or need assistance with the MCP directory API, please join our Discord server