// srt-parser.ts (9.19 kB)
/**
* SRT file parser with validation and error handling
*/
import { SRTFile, SRTSubtitle, SRTValidationError, SRTProcessingResult, SRTTime } from '../types/srt.js';
import { parseSRTTimeRange, validateTimeSequence, parseSRTTime } from '../utils/time-parser.js';
import { validateStyleTags } from '../utils/style-tags.js';
/**
 * Parse the text of an SRT file into subtitles and validate the result.
 *
 * The content is split into blocks separated by blank (whitespace-only) lines;
 * each block is handed to `parseSubtitleBlock`. Afterwards the timing order and
 * the style tags of all parsed subtitles are validated.
 *
 * Handling of the raw text:
 * - CRLF and lone CR line endings are normalised to LF so cue text never
 *   carries a stray `\r`.
 * - Reported line numbers are the real 1-based line numbers in `content`,
 *   regardless of how many blank lines separate the blocks.
 * - Leading blocks that consist only of `TITLE:` / `AUTHOR:` / `DESCRIPTION:`
 *   lines (the header layout produced by `writeSRTFile`) are treated as
 *   metadata and are not reported as malformed subtitle blocks.
 *
 * @param content - Raw SRT file text.
 * @returns A result whose `success` is true only when no errors were recorded.
 *   Unexpected exceptions are converted into a single `format` error on line 1.
 */
export function parseSRTFile(content: string): SRTProcessingResult {
  const errors: SRTValidationError[] = [];
  const warnings: string[] = [];
  const subtitles: SRTSubtitle[] = [];

  try {
    const normalized = content.replace(/\r\n?/g, '\n');

    // Surrounding whitespace is trimmed before splitting; count the lines that
    // the trim removes from the front so reported line numbers stay accurate.
    const firstContent = normalized.search(/\S/);
    const leadingLineCount =
      firstContent > 0 ? normalized.slice(0, firstContent).split('\n').length - 1 : 0;
    const allLines = normalized.trim().split('\n');

    const metadataLine = /^(?:TITLE|AUTHOR|DESCRIPTION):/i;
    let inHeader = true; // true until the first non-metadata block is seen
    let blockLines: string[] = [];
    let blockStart = 1;

    // Walk one step past the end so the final block is flushed by the same path.
    for (let i = 0; i <= allLines.length; i++) {
      const line = i < allLines.length ? allLines[i] : '';

      if (line.trim() !== '') {
        if (blockLines.length === 0) {
          blockStart = leadingLineCount + i + 1;
        }
        blockLines.push(line);
        continue;
      }

      // Blank line: close the block collected so far, if any.
      if (blockLines.length === 0) continue;
      const block = blockLines;
      blockLines = [];

      if (inHeader && block.every((entry) => metadataLine.test(entry.trim()))) {
        continue; // metadata header, picked up separately by extractMetadata
      }
      inHeader = false;

      const subtitle = parseSubtitleBlock(block, blockStart);
      if (subtitle) {
        subtitles.push(subtitle);
      } else {
        errors.push({
          line: blockStart,
          message: 'Failed to parse subtitle block',
          type: 'format'
        });
      }
    }

    // Validate timing sequences
    validateTimingSequences(subtitles, errors);
    // Validate style tags
    validateAllStyleTags(subtitles, errors, warnings);

    return {
      success: errors.length === 0,
      file: {
        subtitles,
        metadata: extractMetadata(content)
      },
      errors: errors.length > 0 ? errors : undefined,
      warnings: warnings.length > 0 ? warnings : undefined
    };
  } catch (error) {
    return {
      success: false,
      errors: [{
        line: 1,
        message: `Parse error: ${error instanceof Error ? error.message : 'Unknown error'}`,
        type: 'format'
      }]
    };
  }
}
/**
 * Parse one subtitle block: an index line, a time-range line and one or more
 * text lines.
 *
 * The index line may contain extra text; in that case the first run of digits
 * is used as the index and a warning is logged. The time range is first parsed
 * with the strict parser (after normalising `->`, `→` or `—>` to `-->`) and, if
 * that fails or no arrow is present, with `parseFlexibleTimeRange`.
 *
 * @param lines - Lines of a single block, without blank separator lines.
 * @param startLine - Line number of the block's first line; not used here.
 * @returns The parsed subtitle, or `null` when the block has fewer than three
 *   lines, no positive index can be determined, or the time range cannot be
 *   parsed.
 */
function parseSubtitleBlock(lines: string[], startLine: number): SRTSubtitle | null {
  if (lines.length < 3) {
    return null;
  }

  // Parse index - be more flexible with index parsing
  const indexLine = lines[0].trim();
  let index = parseInt(indexLine, 10);
  if (isNaN(index) || index < 1) {
    // Try to extract number from line if it has extra text
    const numberMatch = indexLine.match(/(\d+)/);
    const extractedIndex = numberMatch ? parseInt(numberMatch[1], 10) : NaN;
    if (isNaN(extractedIndex) || extractedIndex < 1) {
      return null;
    }
    console.warn(`Warning: Subtitle index line "${indexLine}" contains extra text, using extracted number: ${extractedIndex}`);
    // Adopt the recovered number; otherwise the subtitle would carry NaN
    // (or a non-positive value) as its index.
    index = extractedIndex;
  }

  // Parse time range - be more flexible with time format
  const timeRange = lines[1].trim();

  // Build the input for the strict parser: the line itself when it already
  // uses '-->', or the line with the first alternative arrow normalised.
  let strictCandidate: string | undefined;
  if (timeRange.includes('-->')) {
    strictCandidate = timeRange;
  } else {
    const alternateArrow = ['->', '→', '—>'].find((arrow) => timeRange.includes(arrow));
    if (alternateArrow !== undefined) {
      strictCandidate = timeRange.replace(alternateArrow, '-->');
    }
  }

  let startTime: SRTTime | undefined;
  let endTime: SRTTime | undefined;
  let parsedStrictly = false;

  if (strictCandidate !== undefined) {
    try {
      const times = parseSRTTimeRange(strictCandidate);
      startTime = times.start;
      endTime = times.end;
      parsedStrictly = true;
    } catch {
      // Strict parsing failed; fall through to the flexible parser below.
    }
  }

  if (!parsedStrictly) {
    // Also reached when no arrow was found, so separators that only the
    // flexible parser understands (such as ' - ') get a chance.
    try {
      const times = parseFlexibleTimeRange(timeRange);
      startTime = times.start;
      endTime = times.end;
    } catch {
      return null;
    }
  }

  // Ensure we have valid times
  if (!startTime || !endTime) {
    return null;
  }

  // Parse text content
  const rawText = lines.slice(2).join('\n');
  const text = rawText; // For now, preserve all formatting

  return {
    index,
    startTime,
    endTime,
    text,
    rawText
  };
}
/**
 * Split a time-range line on the first separator it contains and parse both
 * sides with `parseFlexibleTime`.
 *
 * Separators are probed in this order: `-->`, `->`, `→`, `—>`, ` - `.
 *
 * @param timeRange - The time-range line of a subtitle block.
 * @returns The parsed start and end times.
 * @throws Error when no separator is present or the split does not yield
 *   exactly two parts; errors from `parseFlexibleTime` propagate unchanged.
 */
function parseFlexibleTimeRange(timeRange: string): { start: SRTTime; end: SRTTime } {
  const separator = ['-->', '->', '→', '—>', ' - '].find((candidate) =>
    timeRange.includes(candidate)
  );
  const pieces: string[] = separator === undefined ? [] : timeRange.split(separator);

  if (pieces.length !== 2) {
    throw new Error(`Invalid time range format: ${timeRange}`);
  }

  const start = parseFlexibleTime(pieces[0].trim());
  const end = parseFlexibleTime(pieces[1].trim());
  return { start, end };
}
/**
 * Parse a single timestamp, tolerating formats the strict parser rejects.
 *
 * `parseSRTTime` is tried first. If it throws, the string is matched against
 * `H:MM:SS` or `HH:MM:SS`, optionally followed by `.` or `,` and one to three
 * fractional digits. A short fraction is right-padded with zeros, so `,5` and
 * `.5` both mean 500 ms.
 *
 * @param timeString - Timestamp text with no surrounding whitespace.
 * @returns The parsed time; milliseconds default to 0 when no fraction is given.
 * @throws Error when the string matches none of the supported formats.
 */
function parseFlexibleTime(timeString: string): SRTTime {
  // Try standard format first
  try {
    return parseSRTTime(timeString);
  } catch {
    // Not a strict SRT timestamp; try the lenient pattern below.
  }

  // One pattern covers HH:MM:SS, HH:MM:SS.mmm and HH:MM:SS,mmm with a 1-3
  // digit fraction after either separator.
  const match = timeString.match(/^(\d{1,2}):(\d{2}):(\d{2})(?:[.,](\d{1,3}))?$/);
  if (!match) {
    throw new Error(`Unable to parse time format: ${timeString}`);
  }

  const [, hours, minutes, seconds, fraction] = match;
  return {
    hours: parseInt(hours, 10),
    minutes: parseInt(minutes, 10),
    seconds: parseInt(seconds, 10),
    milliseconds: fraction ? parseInt(fraction.padEnd(3, '0'), 10) : 0
  };
}
/**
 * Check every subtitle's own start/end pair and its position relative to the
 * preceding subtitle, appending a `timing` error for each failed check.
 *
 * Both checks delegate to `validateTimeSequence`. The reported line number is
 * an estimate derived from the subtitle's position in the array
 * (`position * 4 + 2`).
 *
 * @param subtitles - Parsed subtitles in file order.
 * @param errors - Collector that receives the timing errors.
 */
function validateTimingSequences(subtitles: SRTSubtitle[], errors: SRTValidationError[]): void {
  const report = (position: number, message: string): void => {
    errors.push({
      line: position * 4 + 2, // Approximate line number
      message,
      type: 'timing'
    });
  };

  let previous: SRTSubtitle | undefined;
  for (let position = 0; position < subtitles.length; position += 1) {
    const current = subtitles[position];

    // A subtitle's own start/end pair
    if (!validateTimeSequence(current.startTime, current.endTime)) {
      report(position, `Subtitle ${current.index}: End time must be after start time`);
    }

    // Ordering against the subtitle that precedes it
    if (previous !== undefined && !validateTimeSequence(previous.endTime, current.startTime)) {
      report(position, `Subtitle ${current.index}: Start time must be after previous subtitle end time`);
    }

    previous = current;
  }
}
/**
 * Run `validateStyleTags` on the text of every subtitle and append a `tag`
 * error for each subtitle whose tags are reported invalid.
 *
 * The reported line number is an estimate based on the subtitle's position in
 * the array (`position * 4 + 3`, i.e. the first text line of a four-line
 * block). It deliberately does not use `subtitle.index`, which is 1-based and
 * taken from the file, so it may be non-sequential.
 *
 * @param subtitles - Parsed subtitles in file order.
 * @param errors - Collector that receives the tag errors.
 * @param warnings - Collector for warnings; nothing is added to it here.
 */
function validateAllStyleTags(
  subtitles: SRTSubtitle[],
  errors: SRTValidationError[],
  warnings: string[]
): void {
  for (let position = 0; position < subtitles.length; position++) {
    const subtitle = subtitles[position];
    const validation = validateStyleTags(subtitle.text);
    if (!validation.valid) {
      errors.push({
        line: position * 4 + 3, // Approximate line number
        message: `Subtitle ${subtitle.index}: Invalid style tags - ${validation.errors.join(', ')}`,
        type: 'tag'
      });
    }
  }
}
/**
 * Extract `TITLE:`, `AUTHOR:` and `DESCRIPTION:` metadata lines from SRT
 * content.
 *
 * A label is recognised only at the start of a line (optionally preceded by a
 * byte-order mark, spaces or tabs), and its value must be on the same line.
 * This keeps cue text such as `Subtitle: hello` from being read as a title and
 * stops an empty `TITLE:` line from swallowing the following line. Labels are
 * matched case-insensitively; the first match for each label wins.
 *
 * @param content - Raw SRT file text.
 * @returns The metadata found, or `undefined` when none of the labels occur.
 */
function extractMetadata(content: string): SRTFile['metadata'] {
  // Look for common metadata patterns, anchored to the start of a line
  const titleMatch = content.match(/^\uFEFF?[ \t]*TITLE:[ \t]*(.+)$/im);
  const authorMatch = content.match(/^\uFEFF?[ \t]*AUTHOR:[ \t]*(.+)$/im);
  const descMatch = content.match(/^\uFEFF?[ \t]*DESCRIPTION:[ \t]*(.+)$/im);

  const metadata: SRTFile['metadata'] = {};
  if (titleMatch) metadata.title = titleMatch[1].trim();
  if (authorMatch) metadata.author = authorMatch[1].trim();
  if (descMatch) metadata.description = descMatch[1].trim();

  return Object.keys(metadata).length > 0 ? metadata : undefined;
}
/**
 * Serialise an `SRTFile` object to SRT text.
 *
 * Non-empty metadata fields are written first as `TITLE:` / `AUTHOR:` /
 * `DESCRIPTION:` lines followed by one blank line. Each subtitle is then
 * written as its index, its time range (formatted with `formatSRTTime`), its
 * `rawText`, and a blank separator line.
 *
 * @param srtFile - The file object to serialise.
 * @returns The SRT text, with lines joined by `\n`.
 */
export function writeSRTFile(srtFile: SRTFile): string {
  const { metadata, subtitles } = srtFile;

  // Optional metadata header
  const header: string[] = [];
  if (metadata) {
    if (metadata.title) header.push(`TITLE: ${metadata.title}`);
    if (metadata.author) header.push(`AUTHOR: ${metadata.author}`);
    if (metadata.description) header.push(`DESCRIPTION: ${metadata.description}`);
    if (header.length > 0) header.push(''); // Blank line closes the header
  }

  // One four-entry group per subtitle
  const cues: string[] = [];
  for (const cue of subtitles) {
    const range = `${formatSRTTime(cue.startTime)} --> ${formatSRTTime(cue.endTime)}`;
    cues.push(cue.index.toString(), range, cue.rawText, '');
  }

  return header.concat(cues).join('\n');
}
/**
 * Render a time value as an SRT timestamp, `HH:MM:SS,mmm`.
 *
 * Hours, minutes and seconds are left-padded with zeros to two digits and
 * milliseconds to three; values that are already wider are not truncated.
 *
 * @param time - The time components to format.
 * @returns The formatted timestamp.
 */
function formatSRTTime(time: { hours: number; minutes: number; seconds: number; milliseconds: number }): string {
  const { hours, minutes, seconds, milliseconds } = time;
  const clock = [hours, minutes, seconds]
    .map((part) => part.toString().padStart(2, '0'))
    .join(':');
  const fraction = milliseconds.toString().padStart(3, '0');
  return `${clock},${fraction}`;
}