/**
* YouTube transcript fetching module
* Uses web scraping to fetch public video transcripts
*/
import { YoutubeTranscript } from 'youtube-transcript';
import { Transcript, TranscriptEntry } from './types.js';
/**
 * Retrieves the captions of a YouTube video and packages them as a Transcript.
 *
 * The captions are requested through the `youtube-transcript` package. Each
 * returned segment is reduced to its `text`, `offset` and `duration` fields,
 * and the segment texts are additionally joined (space-separated) into one
 * string.
 *
 * NOTE(review): `offset` and `duration` are passed through unchanged, while
 * `formatTranscript` renders `offset` as milliseconds — confirm the unit
 * reported by the installed `youtube-transcript` version.
 *
 * @param videoId YouTube video ID
 * @param language Caption language code to request (default: 'en')
 * @returns The video ID, the requested language, the per-segment entries and the joined full text
 * @throws Error with a descriptive message when fetching or converting the transcript fails
 */
export async function getVideoTranscript(
  videoId: string,
  language: string = 'en'
): Promise<Transcript> {
  try {
    const rawSegments = await YoutubeTranscript.fetchTranscript(videoId, {
      lang: language,
    });

    // Keep only the fields that our Transcript format defines.
    const entries: TranscriptEntry[] = rawSegments.map((entry) => {
      const { text, offset, duration } = entry;
      return { text, offset, duration };
    });

    // Space-separated concatenation of every segment's text.
    const fullText = entries.map((entry) => entry.text).join(' ');

    return { videoId, language, entries, fullText };
  } catch (error) {
    // Anything that is not an Error instance carries no usable message.
    if (!(error instanceof Error)) {
      throw new Error('Failed to fetch transcript: Unknown error');
    }

    // Known failure modes are recognised by substrings of the message.
    const reason = error.message;
    if (reason.includes('Could not find captions')) {
      throw new Error(
        `No transcript available for video ${videoId}. The video may not have captions enabled.`
      );
    }
    if (reason.includes('Video unavailable')) {
      throw new Error(
        `Video ${videoId} is unavailable or private. Cannot fetch transcript.`
      );
    }

    throw new Error(`Failed to fetch transcript: ${reason}`);
  }
}
/**
 * Renders a transcript as plain text, optionally prefixing every entry with
 * its start time.
 *
 * @param transcript Transcript object
 * @param includeTimestamps When false, the transcript's `fullText` is returned as-is (default: true)
 * @returns One `[timestamp] text` line per entry joined by newlines, or the full text
 */
export function formatTranscript(
  transcript: Transcript,
  includeTimestamps: boolean = true
): string {
  if (includeTimestamps) {
    // One line per entry: "[<timestamp>] <text>".
    return transcript.entries
      .map((entry) => `[${formatTimestamp(entry.offset)}] ${entry.text}`)
      .join('\n');
  }

  return transcript.fullText;
}
/**
 * Converts a millisecond offset to an `M:SS` timestamp, or `H:MM:SS` once the
 * offset reaches one hour.
 *
 * The leading unit is not zero-padded; the remaining units are padded to two
 * digits. Fractions of a second are truncated. Negative or non-finite inputs
 * (NaN, +/-Infinity) are treated as 0 so the result is always well-formed.
 *
 * @param ms Milliseconds
 * @returns Formatted timestamp string
 */
function formatTimestamp(ms: number): string {
  // Clamp invalid offsets: unclamped, -500 renders as "-1:-1" and NaN as "NaN:NaN".
  const safeMs = Number.isFinite(ms) && ms > 0 ? ms : 0;

  const totalSeconds = Math.floor(safeMs / 1000);
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor((totalSeconds % 3600) / 60);
  const seconds = totalSeconds % 60;

  const pad = (value: number): string => value.toString().padStart(2, '0');

  if (hours > 0) {
    return `${hours}:${pad(minutes)}:${pad(seconds)}`;
  }
  return `${minutes}:${pad(seconds)}`;
}