Git MCP Server

whisper.provider.ts (8.25 kB)
/**
 * @fileoverview OpenAI Whisper speech-to-text provider implementation.
 * @module src/services/speech/providers/whisper
 */
import { injectable } from 'tsyringe';

import { JsonRpcErrorCode, McpError } from '@/types-global/errors.js';
import {
  fetchWithTimeout,
  logger,
  requestContextService,
} from '@/utils/index.js';

import type { ISpeechProvider } from '../core/ISpeechProvider.js';
import type {
  SpeechProviderConfig,
  SpeechToTextOptions,
  SpeechToTextResult,
  TextToSpeechOptions,
  TextToSpeechResult,
  Voice,
  WordTimestamp,
} from '../types.js';

/**
 * OpenAI Whisper API response for transcription.
 */
interface WhisperTranscriptionResponse {
  text: string;
  task?: string;
  language?: string;
  duration?: number;
  words?: Array<{
    word: string;
    start: number;
    end: number;
  }>;
}

/**
 * OpenAI Whisper STT provider.
 * Supports high-quality speech-to-text transcription in multiple languages.
 */
@injectable()
export class WhisperProvider implements ISpeechProvider {
  public readonly name = 'openai-whisper';
  public readonly supportsTTS = false;
  public readonly supportsSTT = true;

  private readonly apiKey: string;
  private readonly baseUrl: string;
  private readonly defaultModelId: string;
  private readonly timeout: number;

  constructor(config: SpeechProviderConfig) {
    if (!config.apiKey) {
      throw new McpError(
        JsonRpcErrorCode.InvalidParams,
        'OpenAI API key is required',
      );
    }

    this.apiKey = config.apiKey;
    this.baseUrl = config.baseUrl || 'https://api.openai.com/v1';
    this.defaultModelId = config.defaultModelId || 'whisper-1';
    this.timeout = config.timeout || 60000; // Longer timeout for audio processing

    logger.info(
      `OpenAI Whisper STT provider initialized: ${this.baseUrl}, model=${this.defaultModelId}`,
    );
  }

  /**
   * Text-to-speech is not supported by Whisper.
   */
  textToSpeech(_options: TextToSpeechOptions): Promise<TextToSpeechResult> {
    throw new McpError(
      JsonRpcErrorCode.MethodNotFound,
      'Text-to-speech is not supported by Whisper provider',
    );
  }
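  // Construction sketch (illustrative, not part of the original file): the
  // @injectable decorator suggests the server normally resolves this provider
  // through its tsyringe container, but it can also be built directly,
  // assuming apiKey is the only required field of SpeechProviderConfig:
  //
  //   const provider = new WhisperProvider({
  //     apiKey: process.env.OPENAI_API_KEY ?? '',
  //   });
  //
  // baseUrl, defaultModelId, and timeout fall back to the constructor defaults.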
  /**
   * Convert speech audio to text using OpenAI Whisper API.
   */
  async speechToText(
    options: SpeechToTextOptions,
  ): Promise<SpeechToTextResult> {
    const context = requestContextService.createRequestContext({
      operation: 'whisper-stt',
      ...(options.context || {}),
    });
    const modelId = options.modelId || this.defaultModelId;

    logger.debug('Converting speech to text with Whisper', context);

    // Validate audio input
    if (!options.audio) {
      throw new McpError(
        JsonRpcErrorCode.InvalidParams,
        'Audio data is required',
        context,
      );
    }

    // Convert audio to Buffer if it's a base64 string
    let audioBuffer: Buffer;
    if (typeof options.audio === 'string') {
      try {
        audioBuffer = Buffer.from(options.audio, 'base64');
      } catch (_error) {
        throw new McpError(
          JsonRpcErrorCode.InvalidParams,
          'Invalid base64 audio data',
          context,
        );
      }
    } else {
      audioBuffer = options.audio;
    }

    // Check file size (Whisper has a 25MB limit)
    const maxSize = 25 * 1024 * 1024; // 25MB
    if (audioBuffer.length > maxSize) {
      throw new McpError(
        JsonRpcErrorCode.InvalidParams,
        `Audio file exceeds maximum size of 25MB (got ${Math.round(audioBuffer.length / 1024 / 1024)}MB)`,
        context,
      );
    }

    const url = `${this.baseUrl}/audio/transcriptions`;

    // Build form data
    const formData = new FormData();

    // Determine filename with appropriate extension
    const extension = this.getFileExtension(options.format);
    const blob = new Blob([audioBuffer], {
      type: this.getMimeType(options.format),
    });
    formData.append('file', blob, `audio.${extension}`);
    formData.append('model', modelId);

    if (options.language) {
      formData.append('language', options.language);
    }
    if (options.temperature !== undefined) {
      formData.append('temperature', options.temperature.toString());
    }
    if (options.prompt) {
      formData.append('prompt', options.prompt);
    }

    // Request verbose JSON format to get timestamps and metadata
    formData.append(
      'response_format',
      options.timestamps ? 'verbose_json' : 'json',
    );
    if (options.timestamps) {
      formData.append('timestamp_granularities[]', 'word');
    }

    try {
      const response = await fetchWithTimeout(url, this.timeout, context, {
        method: 'POST',
        headers: {
          Authorization: `Bearer ${this.apiKey}`,
          // Don't set Content-Type - let fetch set it with boundary for FormData
        },
        body: formData,
      });

      if (!response.ok) {
        const errorText = await response.text();
        logger.error(`Whisper API error: ${response.status}`, context);
        throw new McpError(
          JsonRpcErrorCode.InternalError,
          `Whisper API error: ${response.status} - ${errorText}`,
          context,
        );
      }

      const data = (await response.json()) as WhisperTranscriptionResponse;

      // Convert word timestamps if present
      const words: WordTimestamp[] | undefined = data.words?.map((w) => ({
        word: w.word,
        start: w.start,
        end: w.end,
      }));

      logger.info(
        `Speech-to-text transcription successful (${data.text.length} chars)`,
        context,
      );

      return {
        text: data.text,
        ...(data.language !== undefined && { language: data.language }),
        ...(data.duration !== undefined && { duration: data.duration }),
        ...(words !== undefined && { words }),
        metadata: {
          modelId,
          provider: this.name,
          ...(data.task !== undefined && { task: data.task }),
        },
      };
    } catch (error) {
      if (error instanceof McpError) {
        throw error;
      }
      logger.error(
        'Failed to transcribe audio',
        error instanceof Error ? error : new Error(String(error)),
        context,
      );
      throw new McpError(
        JsonRpcErrorCode.InternalError,
        `Failed to transcribe audio: ${error instanceof Error ? error.message : 'Unknown error'}`,
        context,
      );
    }
  }
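  // Result-shape sketch (illustrative values, not real output): a successful
  // call with timestamps enabled resolves to roughly
  //
  //   {
  //     text: 'hello world',
  //     language: 'english',
  //     duration: 1.2,
  //     words: [{ word: 'hello', start: 0.0, end: 0.42 }, /* ... */],
  //     metadata: { modelId: 'whisper-1', provider: 'openai-whisper', task: 'transcribe' },
  //   }
  //
  // Optional fields are spread into the result only when the API returned them.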
  /**
   * Get voices is not applicable for STT providers.
   */
  getVoices(): Promise<Voice[]> {
    throw new McpError(
      JsonRpcErrorCode.MethodNotFound,
      'Voice listing is not supported by Whisper provider (STT only)',
    );
  }

  /**
   * Health check for OpenAI Whisper API.
   */
  async healthCheck(): Promise<boolean> {
    try {
      // Simple health check: verify API key by making a models list request
      const context = requestContextService.createRequestContext({
        operation: 'whisper-healthCheck',
      });
      const response = await fetchWithTimeout(
        `${this.baseUrl}/models`,
        5000,
        context,
        {
          method: 'GET',
          headers: {
            Authorization: `Bearer ${this.apiKey}`,
          },
        },
      );
      return response.ok;
    } catch (error) {
      const context = requestContextService.createRequestContext({
        operation: 'whisper-healthCheck',
      });
      logger.error(
        'Whisper health check failed',
        error instanceof Error ? error : new Error(String(error)),
        context,
      );
      return false;
    }
  }

  /**
   * Get file extension for audio format.
   */
  private getFileExtension(format?: string): string {
    const formatMap: Record<string, string> = {
      mp3: 'mp3',
      wav: 'wav',
      ogg: 'ogg',
      flac: 'flac',
      webm: 'webm',
      m4a: 'm4a',
    };
    return format && formatMap[format] ? formatMap[format] : 'mp3';
  }

  /**
   * Get MIME type for audio format.
   */
  private getMimeType(format?: string): string {
    const mimeMap: Record<string, string> = {
      mp3: 'audio/mpeg',
      wav: 'audio/wav',
      ogg: 'audio/ogg',
      flac: 'audio/flac',
      webm: 'audio/webm',
      m4a: 'audio/mp4',
    };
    return format && mimeMap[format] ? mimeMap[format] : 'audio/mpeg';
  }
}
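
For orientation, a minimal usage sketch follows. It is an illustration built on assumptions, not part of the module: it presumes OPENAI_API_KEY is set in the environment, that a ./sample.mp3 file exists, that the relative import resolves in your build setup, and that apiKey is the only required field of SpeechProviderConfig.

// usage-example.ts - a minimal sketch, not part of the provider module.
import { readFileSync } from 'node:fs';

import { WhisperProvider } from './whisper.provider.js';

async function main(): Promise<void> {
  // Assumption: apiKey is the only required SpeechProviderConfig field.
  const provider = new WhisperProvider({
    apiKey: process.env.OPENAI_API_KEY ?? '',
  });

  // Optional: verify the API key before sending audio.
  if (!(await provider.healthCheck())) {
    throw new Error('Whisper health check failed');
  }

  // speechToText accepts a Buffer or a base64 string; a Buffer is used here.
  const result = await provider.speechToText({
    audio: readFileSync('./sample.mp3'), // hypothetical sample file
    format: 'mp3',
    timestamps: true,
  });

  console.log(result.text);
  console.log(result.words?.length ?? 0, 'word timestamps');
}

main().catch(console.error);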
