Sandbox MCP Server

  • src
#!/usr/bin/env node
import express, { Request, Response, NextFunction } from 'express';
import cors from 'cors';
import { exec } from 'child_process';
import { promisify } from 'util';
import * as path from 'path';
import * as fs from 'fs';
import * as net from 'net';
import OpenAI from 'openai';
import dotenv from 'dotenv';

dotenv.config();

const execAsync = promisify(exec);

// Configuration
const DEFAULT_VOICE = 'Microsoft Jenny(Natural) - English (United States)';
const DEFAULT_TIMEOUT = parseInt(process.env.TIMEOUT || '30000', 10);
const DEFAULT_PORT = 3000;

// Initialize OpenAI
const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

// Type definitions for request arguments
interface TextToSpeechArgs {
  text: string;
  voice?: string;
  speed?: number;
}

interface SpeechToTextArgs {
  duration?: number;
}

interface ChatArgs {
  message: string;
  voice?: string;
  speed?: number;
}

// Helper function to find an available port
async function findAvailablePort(startPort: number): Promise<number> {
  const isPortAvailable = (port: number): Promise<boolean> => {
    return new Promise((resolve) => {
      const server = net.createServer()
        .once('error', () => resolve(false))
        .once('listening', () => {
          server.close();
          resolve(true);
        })
        .listen(port);
    });
  };

  for (let port = startPort; port < startPort + 100; port++) {
    if (await isPortAvailable(port)) {
      return port;
    }
  }
  throw new Error('No available ports found');
}

// Helper function to get available Windows voices
async function getWindowsVoices(): Promise<string[]> {
  try {
    const { stdout } = await execAsync(
      'powershell -Command "Add-Type -AssemblyName System.Speech; (New-Object System.Speech.Synthesis.SpeechSynthesizer).GetInstalledVoices().VoiceInfo.Name"',
      { timeout: DEFAULT_TIMEOUT }
    );
    return stdout.split('\n').map(v => v.trim()).filter(Boolean);
  } catch (error) {
    console.error('Error getting voices:', error);
    return [DEFAULT_VOICE];
  }
}

// Helper function to speak text using Windows TTS
async function speakText(text: string, voice: string = DEFAULT_VOICE, speed: number = 1.0): Promise<void> {
  const script = `
    Add-Type -AssemblyName System.Speech;
    $synthesizer = New-Object System.Speech.Synthesis.SpeechSynthesizer;
    $synthesizer.SelectVoice('${voice}');
    $synthesizer.Rate = ${Math.round((speed - 1) * 10)};
    $synthesizer.Speak('${text.replace(/'/g, "''")}');
  `;
  await execAsync(`powershell -Command "${script}"`, { timeout: DEFAULT_TIMEOUT });
}

// Helper function to get GPT-4 response
async function getChatResponse(message: string): Promise<string> {
  try {
    const completion = await openai.chat.completions.create({
      model: "gpt-4",
      messages: [
        {
          role: "system",
          content: "You are a helpful assistant. Keep your responses concise and natural, as they will be spoken aloud."
        },
        { role: "user", content: message }
      ],
      temperature: 0.7,
      max_tokens: 150
    });
    return completion.choices[0]?.message?.content || "I'm sorry, I couldn't generate a response.";
  } catch (error) {
    console.error('Error getting GPT-4 response:', error);
    throw error;
  }
}

// Initialize Express app
const app = express();
app.use(cors());
app.use(express.json());
app.use(express.static('test'));

// Add timeout middleware
app.use((req: Request, res: Response, next: NextFunction) => {
  res.setTimeout(DEFAULT_TIMEOUT, () => {
    res.status(408).json({ error: 'Request timeout' });
  });
  next();
});

// Get available voices
app.get('/voices', async (_req: Request, res: Response) => {
  try {
    const voices = await getWindowsVoices();
    res.json(voices);
  } catch (error) {
    res.status(500).json({ error: error instanceof Error ? error.message : String(error) });
  }
});

// Text to Speech
app.post('/tts', async (req: Request<{}, {}, TextToSpeechArgs>, res: Response) => {
  try {
    const { text, voice = DEFAULT_VOICE, speed = 1.0 } = req.body;
    if (!text) {
      return res.status(400).json({ error: 'Text is required' });
    }
    await speakText(text, voice, speed);
    res.json({ success: true });
  } catch (error) {
    if (error instanceof Error && error.message.includes('timeout')) {
      res.status(408).json({ error: 'Operation timed out' });
    } else {
      res.status(500).json({ error: error instanceof Error ? error.message : String(error) });
    }
  }
});

// Speech to Text
app.post('/stt', async (req: Request<{}, {}, SpeechToTextArgs>, res: Response) => {
  try {
    const { duration = 5 } = req.body;
    const audioFile = path.join(__dirname, 'recording.wav');

    // Record audio using PowerShell (assumes the NAudio .NET assemblies are loadable from PowerShell)
    const recordScript = `
      Add-Type -AssemblyName System.Windows.Forms;
      $audio = New-Object System.IO.MemoryStream;
      $waveSource = New-Object NAudio.Wave.WaveInEvent;
      $waveSource.WaveFormat = New-Object NAudio.Wave.WaveFormat(16000, 1);
      $waveFile = New-Object NAudio.Wave.WaveFileWriter('${audioFile}', $waveSource.WaveFormat);
      $waveSource.DataAvailable = { param($sender, $e) $waveFile.Write($e.Buffer, 0, $e.BytesRecorded) };
      $waveSource.StartRecording();
      Start-Sleep -Seconds ${duration};
      $waveSource.StopRecording();
      $waveFile.Dispose();
    `;
    await execAsync(`powershell -Command "${recordScript}"`, { timeout: DEFAULT_TIMEOUT + (duration * 1000) });

    // Transcribe the recorded audio
    const transcribeScript = `
      Add-Type -AssemblyName System.Speech;
      $recognizer = New-Object System.Speech.Recognition.SpeechRecognizer;
      $grammar = New-Object System.Speech.Recognition.DictationGrammar;
      $recognizer.LoadGrammar($grammar);
      $audio = [System.IO.File]::ReadAllBytes('${audioFile}');
      $stream = New-Object System.IO.MemoryStream(@(,$audio));
      $result = $recognizer.RecognizeSync([System.Speech.AudioFormat.AudioStream]::new($stream));
      $result.Text;
    `;
    const { stdout } = await execAsync(`powershell -Command "${transcribeScript}"`, { timeout: DEFAULT_TIMEOUT });

    // Clean up the audio file
    await fs.promises.unlink(audioFile);

    res.json({ text: stdout.trim() || 'No speech detected' });
  } catch (error) {
    // Clean up the audio file if it exists
    const audioFile = path.join(__dirname, 'recording.wav');
    if (fs.existsSync(audioFile)) {
      await fs.promises.unlink(audioFile);
    }
    if (error instanceof Error && error.message.includes('timeout')) {
      res.status(408).json({ error: 'Operation timed out' });
    } else {
      res.status(500).json({ error: error instanceof Error ? error.message : String(error) });
    }
  }
});

// Chat endpoint that gets GPT-4 response and speaks it
app.post('/chat', async (req: Request<{}, {}, ChatArgs>, res: Response) => {
  try {
    const { message, voice = DEFAULT_VOICE, speed = 1.0 } = req.body;
    if (!message) {
      return res.status(400).json({ error: 'Message is required' });
    }

    // Get GPT-4 response
    const response = await getChatResponse(message);

    // Speak the response
    await speakText(response, voice, speed);

    res.json({ success: true, response, spoken: true });
  } catch (error) {
    if (error instanceof Error && error.message.includes('timeout')) {
      res.status(408).json({ error: 'Operation timed out' });
    } else {
      res.status(500).json({ error: error instanceof Error ? error.message : String(error) });
    }
  }
});

// Start the server
async function startServer() {
  try {
    const port = await findAvailablePort(DEFAULT_PORT);
    app.listen(port, () => {
      console.log(`Windows Speech Server running at http://localhost:${port}`);
      console.log(`Using default voice: ${DEFAULT_VOICE}`);
      console.log(`Timeout set to: ${DEFAULT_TIMEOUT}ms`);
      console.log('GPT-4 integration enabled');
    });
  } catch (error) {
    console.error('Failed to start server:', error);
    process.exit(1);
  }
}

startServer();
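
Once the server is up, the /voices, /tts, /stt, and /chat endpoints can be exercised over plain HTTP. Below is a minimal client sketch, assuming the server settled on the default port 3000 (findAvailablePort may have picked a higher one) and Node 18+ for the global fetch; the message text is only an illustrative placeholder.

// Minimal client sketch: send a message to /chat and print the reply.
// Assumes the server bound to port 3000 and Node 18+ (global fetch available).
async function demo(): Promise<void> {
  const res = await fetch('http://localhost:3000/chat', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ message: 'Summarize the weather forecast in one sentence.', speed: 1.1 }),
  });
  if (!res.ok) {
    throw new Error(`Request failed with status ${res.status}`);
  }
  const data = await res.json() as { success: boolean; response: string; spoken: boolean };
  console.log(data.response); // the GPT-4 reply that was also spoken aloud via Windows TTS
}

demo().catch(console.error);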