YouTube Knowledge MCP

transcript.ts•9.94 kB

import { YouTubeTranscript } from '../types.js'; import { Logger } from 'winston'; import axios from 'axios'; /** * Utility class for processing YouTube video transcripts */ export class TranscriptProcessor { private logger: Logger; constructor(logger: Logger) { this.logger = logger; } /** * Extract transcript from a YouTube video * Note: This is a simplified implementation. For production use, * consider using libraries like youtube-transcript-api */ async extractTranscript(videoId: string): Promise<YouTubeTranscript[]> { try { // First, try to get the video page to extract caption tracks const videoUrl = `https://www.youtube.com/watch?v=${videoId}`; const response = await axios.get(videoUrl, { headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' }, timeout: 10000 }); const htmlContent = response.data; // Extract caption tracks from the HTML const captionTracks = this.extractCaptionTracks(htmlContent); if (captionTracks.length === 0) { this.logger.warn(`No captions found for video ${videoId}`); return []; } // Try to get English captions first, fallback to any available const preferredTrack = this.selectPreferredTrack(captionTracks); if (!preferredTrack) { this.logger.warn(`No suitable caption track found for video ${videoId}`); return []; } // Fetch the caption data const transcript = await this.fetchCaptionData(preferredTrack.baseUrl); this.logger.info(`Successfully extracted transcript for video ${videoId}: ${transcript.length} segments`); return transcript; } catch (error) { this.logger.error(`Failed to extract transcript for video ${videoId}:`, error); return []; } } /** * Extract caption track URLs from YouTube page HTML */ private extractCaptionTracks(html: string): any[] { try { // Look for the player response JSON in the HTML const playerResponseMatch = html.match(/"player":\s*({.+?}),"videoDetails"/); if (!playerResponseMatch) { this.logger.warn('Could not find player response in HTML'); return []; } const playerResponse = JSON.parse(playerResponseMatch[1]); const captionTracks = playerResponse?.captions?.playerCaptionsTracklistRenderer?.captionTracks || []; return captionTracks; } catch (error) { this.logger.error('Error parsing caption tracks:', error); return []; } } /** * Select the preferred caption track (English if available) */ private selectPreferredTrack(tracks: any[]): any | null { if (tracks.length === 0) return null; // Prefer English tracks const englishTrack = tracks.find(track => track.languageCode === 'en' || track.languageCode === 'en-US' || track.languageCode === 'en-GB' ); if (englishTrack) return englishTrack; // Fallback to auto-generated English const autoEnglishTrack = tracks.find(track => track.languageCode?.startsWith('en') && track.kind === 'asr' ); if (autoEnglishTrack) return autoEnglishTrack; // Fallback to first available track return tracks[0]; } /** * Fetch caption data from the track URL */ private async fetchCaptionData(baseUrl: string): Promise<YouTubeTranscript[]> { try { const response = await axios.get(baseUrl, { timeout: 10000 }); const xmlContent = response.data; return this.parseTranscriptXML(xmlContent); } catch (error) { this.logger.error('Failed to fetch caption data:', error); return []; } } /** * Parse YouTube's transcript XML format */ private parseTranscriptXML(xml: string): YouTubeTranscript[] { const transcript: YouTubeTranscript[] = []; try { // Simple regex parsing for transcript segments // In production, consider using a proper XML parser const segments = xml.match(/<text[^>]*start="([^"]*)"[^>]*dur="([^"]*)"[^>]*>([^<]*)<\/text>/g) || []; for (const segment of segments) { const startMatch = segment.match(/start="([^"]*)"/); const durationMatch = segment.match(/dur="([^"]*)"/); const textMatch = segment.match(/>([^<]*)</); if (startMatch && durationMatch && textMatch) { const start = parseFloat(startMatch[1]); const duration = parseFloat(durationMatch[1]); const text = this.decodeHTMLEntities(textMatch[1]); transcript.push({ text: text.trim(), start, duration }); } } } catch (error) { this.logger.error('Error parsing transcript XML:', error); } return transcript; } /** * Decode HTML entities in transcript text */ private decodeHTMLEntities(text: string): string { const entities = { '&': '&', '<': '<', '>': '>', '"': '"', ''': "'", ' ': ' ' }; return text.replace(/&[#\w]+;/g, (entity) => { return entities[entity] || entity; }); } /** * Process transcript for better readability */ processTranscript(transcript: YouTubeTranscript[]): { fullText: string; paragraphs: string[]; summary: { totalDuration: number; wordCount: number; sentenceCount: number; }; } { if (transcript.length === 0) { return { fullText: '', paragraphs: [], summary: { totalDuration: 0, wordCount: 0, sentenceCount: 0 } }; } const fullText = transcript.map(segment => segment.text).join(' '); const sentences = this.splitIntoSentences(fullText); const paragraphs = this.groupIntoParagraphs(sentences); const totalDuration = transcript[transcript.length - 1]?.start + transcript[transcript.length - 1]?.duration || 0; const wordCount = fullText.split(/\s+/).filter(word => word.length > 0).length; const sentenceCount = sentences.length; return { fullText, paragraphs, summary: { totalDuration, wordCount, sentenceCount } }; } /** * Search for specific content in transcript */ searchTranscript(transcript: YouTubeTranscript[], query: string): { matches: Array<{ text: string; start: number; duration: number; context: string; }>; totalMatches: number; } { const matches = []; const queryLower = query.toLowerCase(); for (let i = 0; i < transcript.length; i++) { const segment = transcript[i]; if (segment.text.toLowerCase().includes(queryLower)) { // Get context (previous and next segments) const contextStart = Math.max(0, i - 2); const contextEnd = Math.min(transcript.length - 1, i + 2); const context = transcript .slice(contextStart, contextEnd + 1) .map(s => s.text) .join(' '); matches.push({ text: segment.text, start: segment.start, duration: segment.duration, context }); } } return { matches, totalMatches: matches.length }; } /** * Extract key topics from transcript */ extractTopics(transcript: YouTubeTranscript[]): string[] { if (transcript.length === 0) return []; const fullText = transcript.map(segment => segment.text).join(' ').toLowerCase(); // Simple keyword extraction - in production, use more sophisticated NLP const words = fullText.split(/\s+/) .filter(word => word.length > 3) .filter(word => !this.isStopWord(word)); // Count word frequency const wordCount: Record<string, number> = {}; words.forEach(word => { wordCount[word] = (wordCount[word] || 0) + 1; }); // Get top 20 most frequent words const topics = Object.entries(wordCount) .sort(([, a], [, b]) => b - a) .slice(0, 20) .map(([word]) => word); return topics; } /** * Split text into sentences */ private splitIntoSentences(text: string): string[] { return text .split(/[.!?]+/) .map(sentence => sentence.trim()) .filter(sentence => sentence.length > 0); } /** * Group sentences into paragraphs */ private groupIntoParagraphs(sentences: string[]): string[] { const paragraphs = []; const sentencesPerParagraph = 4; for (let i = 0; i < sentences.length; i += sentencesPerParagraph) { const paragraph = sentences .slice(i, i + sentencesPerParagraph) .join('. ') + '.'; paragraphs.push(paragraph); } return paragraphs; } /** * Check if word is a stop word */ private isStopWord(word: string): boolean { const stopWords = new Set([ 'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', 'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', 'do', 'at', 'this', 'but', 'his', 'by', 'from', 'they', 'we', 'say', 'her', 'she', 'or', 'an', 'will', 'my', 'one', 'all', 'would', 'there', 'their', 'what', 'so', 'up', 'out', 'if', 'about', 'who', 'get', 'which', 'go', 'me', 'when', 'make', 'can', 'like', 'time', 'no', 'just', 'him', 'know', 'take', 'people', 'into', 'year', 'your', 'good', 'some', 'could', 'them', 'see', 'other', 'than', 'then', 'now', 'look', 'only', 'come', 'its', 'over', 'think', 'also', 'back', 'after', 'use', 'two', 'how', 'our', 'work', 'first', 'well', 'way', 'even', 'new', 'want', 'because', 'any', 'these', 'give', 'day', 'most', 'us' ]); return stopWords.has(word.toLowerCase()); } }

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/efikuta/youtube-knowledge-mcp'

If you have feedback or need assistance with the MCP directory API, please join our Discord server