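#!/usr/bin/env node
/**
 * process-large-srt.js
 *
 * Large SRT File Processor
 * Reads an SRT subtitle file, processes its entries in fixed-size chunks, and
 * writes the result to a new file. Conversation timing alignment is currently
 * a local heuristic; MCP tool integration is a planned extension point.
 */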
const fs = require('fs');
const path = require('path');
/**
 * Reads an SRT subtitle file, processes its entries in fixed-size chunks, and
 * writes the result to a new SRT file.
 *
 * The per-entry processing is currently a local heuristic: entries whose text
 * looks like a conversation turn get their end time extended by a small
 * buffer. `processChunk` is the intended extension point for real tooling.
 */
class LargeSRTProcessor {
  /**
   * @param {string} inputFile - Path of the SRT file to read.
   * @param {string} outputFile - Path the processed SRT file is written to.
   * @param {object} [options] - Optional tuning knobs.
   * @param {number} [options.chunkSize=100] - Subtitles processed per chunk;
   *   must be a positive integer.
   * @param {number} [options.bufferMs=200] - Milliseconds added to the end
   *   time of entries detected as conversation; must be a non-negative integer.
   * @throws {RangeError} If an option is out of range.
   */
  constructor(inputFile, outputFile, { chunkSize = 100, bufferMs = 200 } = {}) {
    // A non-positive chunk size would make the chunk loop never advance.
    if (!Number.isInteger(chunkSize) || chunkSize <= 0) {
      throw new RangeError(`chunkSize must be a positive integer, got ${chunkSize}`);
    }
    // formatTime() works on whole milliseconds, so reject fractional buffers.
    if (!Number.isInteger(bufferMs) || bufferMs < 0) {
      throw new RangeError(`bufferMs must be a non-negative integer, got ${bufferMs}`);
    }
    this.inputFile = inputFile;
    this.outputFile = outputFile;
    this.chunkSize = chunkSize;
    this.bufferMs = bufferMs;
    this.processedCount = 0;
    this.totalCount = 0;
  }

  /**
   * Runs the whole pipeline: read, parse, process chunk by chunk, write.
   * Progress is reported on stdout.
   *
   * @returns {Promise<void>}
   * @throws Rethrows any read, processing, or write error after logging it.
   */
  async processFile() {
    try {
      console.log(`Starting to process ${this.inputFile}...`);
      const content = fs.readFileSync(this.inputFile, 'utf8');
      console.log(`File loaded: ${content.length} characters`);

      const srtData = this.parseSRT(content);
      this.totalCount = srtData.length;
      // Reset so the progress figures stay correct when processFile() is
      // called more than once on the same instance.
      this.processedCount = 0;
      console.log(`Found ${this.totalCount} subtitle entries`);

      const totalChunks = Math.ceil(srtData.length / this.chunkSize);
      const processedEntries = [];
      for (let i = 0; i < srtData.length; i += this.chunkSize) {
        const chunk = srtData.slice(i, i + this.chunkSize);
        console.log(`Processing chunk ${Math.floor(i / this.chunkSize) + 1}/${totalChunks} (${chunk.length} entries)`);
        const processedChunk = await this.processChunk(chunk);
        processedEntries.push(...processedChunk);
        this.processedCount += chunk.length;
        console.log(`Progress: ${this.processedCount}/${this.totalCount} (${Math.round((this.processedCount / this.totalCount) * 100)}%)`);
      }

      const outputContent = this.writeSRT(processedEntries);
      fs.writeFileSync(this.outputFile, outputContent, 'utf8');
      console.log(`Processing complete! Output saved to ${this.outputFile}`);
      console.log(`Processed ${processedEntries.length} subtitle entries`);
    } catch (error) {
      console.error('Error processing file:', error);
      throw error;
    }
  }

  /**
   * Splits SRT text into entries. Blocks are separated by blank lines; a block
   * needs at least three lines (index, timing, text) to be kept.
   *
   * @param {string} content - Raw SRT file content.
   * @returns {{index: number, timing: string, text: string}[]} Parsed entries.
   */
  parseSRT(content) {
    // Normalize CRLF / lone CR to LF first; otherwise every line of a
    // Windows-style file keeps a trailing '\r' that leaks into the output.
    const normalized = content.replace(/\r\n?/g, '\n');
    const entries = [];
    for (const block of normalized.split(/\n\s*\n/)) {
      const trimmed = block.trim();
      if (!trimmed) {
        continue;
      }
      const lines = trimmed.split('\n');
      if (lines.length < 3) {
        continue;
      }
      entries.push({
        index: Number.parseInt(lines[0], 10),
        timing: lines[1],
        text: lines.slice(2).join('\n'),
      });
    }
    return entries;
  }

  /**
   * Serializes entries back to SRT text, one blank line between entries.
   *
   * @param {{index: number, timing: string, text: string}[]} entries
   * @returns {string} SRT-formatted text.
   */
  writeSRT(entries) {
    return entries
      .map((entry) => `${entry.index}\n${entry.timing}\n${entry.text}\n`)
      .join('\n');
  }

  /**
   * Processes one chunk of entries. Currently only re-times each entry via
   * alignConversationTiming(); kept async so external tooling can be awaited
   * here later without changing callers.
   *
   * @param {{index: number, timing: string, text: string}[]} chunk
   * @returns {Promise<{index: number, timing: string, text: string}[]>} New
   *   entry objects; the input entries are not mutated.
   */
  async processChunk(chunk) {
    return chunk.map((entry) => ({
      ...entry,
      timing: this.alignConversationTiming(entry.timing, entry.text),
    }));
  }

  /**
   * Returns the timing line to use for an entry: adjusted when the text looks
   * like a conversation turn, otherwise unchanged.
   *
   * @param {string} timing - SRT timing line ("start --> end").
   * @param {string} text - Subtitle text.
   * @returns {string} Timing line.
   */
  alignConversationTiming(timing, text) {
    if (this.detectConversation(text)) {
      return this.adjustTimingForConversation(timing);
    }
    return timing;
  }

  /**
   * Heuristic conversation detector.
   *
   * @param {string} text - Subtitle text.
   * @returns {boolean} True when the text contains a "<b>Speaker N:</b>" tag
   *   or starts with an uppercase ASCII letter and reaches a '?' or '!' before
   *   any '.'.
   */
  detectConversation(text) {
    const conversationIndicators = [
      /<b>Speaker \d+:<\/b>/i,
      // Covers both the '?' and '!' cases, so no separate patterns are needed.
      /^[A-Z][^.!?]*[?!]/,
    ];
    return conversationIndicators.some((pattern) => pattern.test(text));
  }

  /**
   * Extends the end time of a timing line by `this.bufferMs` milliseconds.
   * The following subtitle is not consulted, so the extended end time may
   * overlap the next entry's start.
   *
   * @param {string} timing - SRT timing line ("HH:MM:SS,mmm --> HH:MM:SS,mmm").
   * @returns {string} Re-formatted timing line, or the input unchanged when it
   *   cannot be parsed.
   */
  adjustTimingForConversation(timing) {
    const [start, end] = timing.split(' --> ');
    if (end === undefined) {
      // No separator: leave the line alone instead of throwing.
      return timing;
    }
    const startTime = this.parseTime(start);
    const endTime = this.parseTime(end);
    if (!Number.isFinite(startTime) || !Number.isFinite(endTime)) {
      // Unparseable timecode: avoid writing "NaN:NaN:NaN,NaN" to the output.
      return timing;
    }
    const adjustedEnd = endTime + this.bufferMs;
    return `${this.formatTime(startTime)} --> ${this.formatTime(adjustedEnd)}`;
  }

  /**
   * Converts an "HH:MM:SS,mmm" timecode to milliseconds.
   *
   * @param {string} timeStr - Timecode.
   * @returns {number} Milliseconds, or NaN when the timecode is malformed.
   */
  parseTime(timeStr) {
    const [time, ms] = timeStr.split(',');
    const [hours, minutes, seconds] = time.split(':').map(Number);
    return (hours * 3600 + minutes * 60 + seconds) * 1000 + Number.parseInt(ms, 10);
  }

  /**
   * Converts milliseconds to an "HH:MM:SS,mmm" timecode.
   *
   * @param {number} milliseconds - Non-negative whole milliseconds.
   * @returns {string} Zero-padded timecode.
   */
  formatTime(milliseconds) {
    const totalSeconds = Math.floor(milliseconds / 1000);
    const ms = milliseconds % 1000;
    const hours = Math.floor(totalSeconds / 3600);
    const minutes = Math.floor((totalSeconds % 3600) / 60);
    const seconds = totalSeconds % 60;
    const pad = (value, width) => value.toString().padStart(width, '0');
    return `${pad(hours, 2)}:${pad(minutes, 2)}:${pad(seconds, 2)},${pad(ms, 3)}`;
  }
}
/**
 * Command-line entry point.
 *
 * Expects two positional arguments: the input SRT path and the output SRT
 * path. Prints usage and exits with status 1 when fewer than two arguments
 * are given, or when the input file does not exist; otherwise runs a
 * LargeSRTProcessor over the two paths.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const cliArgs = process.argv.slice(2);
  const hasBothPaths = cliArgs.length >= 2;

  if (!hasBothPaths) {
    const usageLines = [
      'Usage: node process-large-srt.js <input-file> <output-file>',
      'Example: node process-large-srt.js Arabic_Rephrased_Full.srt Arabic_Rephrased_Full_Aligned.srt',
    ];
    usageLines.forEach((line) => console.log(line));
    process.exit(1);
  }

  const inputFile = cliArgs[0];
  const outputFile = cliArgs[1];

  const inputMissing = !fs.existsSync(inputFile);
  if (inputMissing) {
    console.error(`Input file not found: ${inputFile}`);
    process.exit(1);
  }

  await new LargeSRTProcessor(inputFile, outputFile).processFile();
}
// Run the CLI only when this file is executed directly, not when it is
// loaded with require() by another module.
if (require.main === module) {
  main().catch((error) => {
    console.error(error);
    // Without this the process would exit with status 0 after a failure,
    // so shells and CI pipelines could not detect the error.
    process.exitCode = 1;
  });
}
// Expose the processor class for programmatic use.
module.exports = LargeSRTProcessor;