Farnsworth

Overview Schema Related Servers Score Discussions

vtuber_tts.py•11.9 KiB

""" VTuber TTS - Streamlined voice synthesis for Farnsworth VTuber streaming. Single voice (Farnsworth), F5-TTS voice cloning (primary) with XTTS v2 and Edge TTS fallbacks. Pre-loads model at startup for zero-latency first generation. Priority: F5-TTS (fast, high quality) → XTTS v2 → Edge TTS (no cloning) """ import asyncio import hashlib import os import subprocess from pathlib import Path from typing import Optional from loguru import logger # Check available providers at import time F5TTS_AVAILABLE = False XTTS_AVAILABLE = False EDGE_TTS_AVAILABLE = False try: from f5_tts.api import F5TTS as F5TTSModel F5TTS_AVAILABLE = True except ImportError: pass try: from TTS.api import TTS XTTS_AVAILABLE = True except ImportError: pass try: import edge_tts EDGE_TTS_AVAILABLE = True except ImportError: pass class VTuberTTS: """Streamlined TTS for VTuber streaming - Farnsworth voice only. Priority chain: 1. F5-TTS - Fast zero-shot voice cloning (2-3s generation) 2. XTTS v2 - Slower voice cloning fallback (5-25s generation) 3. Edge TTS - No cloning, but fast and reliable """ def __init__( self, reference_audio: str = "", cache_dir: str = "/tmp/vtuber_tts_cache", device: str = "cuda:0", ): self._reference_audio = reference_audio self._cache_dir = Path(cache_dir) self._cache_dir.mkdir(parents=True, exist_ok=True) self._device = device # F5-TTS (primary) self._f5tts_model = None self._f5tts_ready = False self._ref_text = "" # Cached transcription of reference audio # XTTS v2 (secondary fallback) self._xtts_model = None self._xtts_ready = False self._generation_lock = asyncio.Lock() # Edge TTS voice for final fallback self._edge_voice = "en-US-GuyNeural" async def initialize(self) -> bool: """Pre-load TTS models at startup. Call this once.""" has_ref = self._reference_audio and os.path.exists(self._reference_audio) if not has_ref: logger.warning(f"Reference audio not found: {self._reference_audio}") logger.warning("Will use Edge TTS only (no voice cloning)") return EDGE_TTS_AVAILABLE # Try F5-TTS first (primary) if F5TTS_AVAILABLE: try: await self._init_f5tts() except Exception as e: logger.error(f"F5-TTS initialization failed: {e}") self._f5tts_ready = False # Try XTTS as fallback if not self._f5tts_ready and XTTS_AVAILABLE: try: await self._init_xtts() except Exception as e: logger.error(f"XTTS initialization failed: {e}") self._xtts_ready = False ready = self._f5tts_ready or self._xtts_ready or EDGE_TTS_AVAILABLE if self._f5tts_ready: logger.info("VTuberTTS ready: F5-TTS (primary)") elif self._xtts_ready: logger.info("VTuberTTS ready: XTTS v2 (fallback)") elif EDGE_TTS_AVAILABLE: logger.info("VTuberTTS ready: Edge TTS only (no voice cloning)") return ready async def _init_f5tts(self): """Initialize F5-TTS model.""" loop = asyncio.get_event_loop() def load_model(): logger.info("Loading F5-TTS v1 model...") model = F5TTSModel( model="F5TTS_v1_Base", device=self._device if "cuda" in self._device else None, ) logger.info("F5-TTS v1 loaded") return model self._f5tts_model = await loop.run_in_executor(None, load_model) if self._f5tts_model is not None: # Pre-transcribed reference text (Farnsworth voice clips) # Avoids needing Whisper model download at runtime self._ref_text = ( "Any more ridiculous ideas? Are you all right? Bad news, everyone. " "The creature is a shapeshifter. It knocked me out and took my form " "so it could prey on poor Hermes. Damn! Have you ever dissected a yeti before?" ) logger.info(f"Using pre-transcribed reference text ({len(self._ref_text)} chars)") # Warm up with a short generation logger.info("Warming up F5-TTS...") warmup_path = self._cache_dir / "warmup_f5.wav" ok = await self._generate_f5tts("Hello, testing.", warmup_path) if warmup_path.exists(): warmup_path.unlink() if ok: self._f5tts_ready = True logger.info("F5-TTS warm-up complete - ready for streaming") else: logger.warning("F5-TTS warm-up failed") async def _init_xtts(self): """Initialize XTTS v2 model.""" loop = asyncio.get_event_loop() def load_model(): import torch logger.info("Loading XTTS v2 model...") model = TTS("tts_models/multilingual/multi-dataset/xtts_v2") if torch.cuda.is_available(): model = model.to(self._device) logger.info("XTTS v2 loaded on GPU") return model self._xtts_model = await loop.run_in_executor(None, load_model) self._xtts_ready = self._xtts_model is not None if self._xtts_ready: logger.info("Warming up XTTS with reference audio...") warmup_path = self._cache_dir / "warmup.wav" await self._generate_xtts("Hello.", warmup_path) if warmup_path.exists(): warmup_path.unlink() logger.info("XTTS warm-up complete") async def generate(self, text: str) -> Optional[str]: """Generate speech audio for the given text. Returns path to WAV file, or None on failure. All speech uses Farnsworth's cloned voice. """ if not text or not text.strip(): return None text = self._clean_text(text) if not text: return None # Check cache cache_path = self._get_cache_path(text) if cache_path.exists(): return str(cache_path) async with self._generation_lock: # Double-check cache after acquiring lock if cache_path.exists(): return str(cache_path) # Try F5-TTS first (fast voice cloning, 2-3s) if self._f5tts_ready: try: result = await asyncio.wait_for( self._generate_f5tts(text, cache_path), timeout=30.0, ) if result and cache_path.exists(): return str(cache_path) logger.warning("F5-TTS generation failed, trying XTTS fallback") except asyncio.TimeoutError: logger.warning("F5-TTS generation timed out (30s), trying XTTS") # Try XTTS v2 (slower voice cloning) if self._xtts_ready: try: result = await asyncio.wait_for( self._generate_xtts(text, cache_path), timeout=30.0, ) if result and cache_path.exists(): return str(cache_path) logger.warning("XTTS generation failed, falling back to Edge TTS") except asyncio.TimeoutError: logger.warning("XTTS generation timed out (30s), using Edge TTS") # Fallback: Edge TTS (no cloning but fast and reliable) if EDGE_TTS_AVAILABLE: result = await self._generate_edge(text, cache_path) if result: return str(cache_path) logger.error("All TTS providers failed") return None async def _generate_f5tts(self, text: str, output_path: Path) -> bool: """Generate with F5-TTS voice cloning.""" try: loop = asyncio.get_event_loop() def _run(): self._f5tts_model.infer( ref_file=self._reference_audio, ref_text=self._ref_text, gen_text=text, file_wave=str(output_path), show_info=logger.debug, nfe_step=32, speed=1.0, ) return True return await loop.run_in_executor(None, _run) except Exception as e: logger.error(f"F5-TTS error: {e}") return False async def _generate_xtts(self, text: str, output_path: Path) -> bool: """Generate with XTTS v2 voice cloning.""" try: loop = asyncio.get_event_loop() def _run(): self._xtts_model.tts_to_file( text=text, speaker_wav=self._reference_audio, language="en", file_path=str(output_path), ) return True return await loop.run_in_executor(None, _run) except Exception as e: logger.error(f"XTTS error: {e}") return False async def _generate_edge(self, text: str, output_path: Path) -> bool: """Generate with Edge TTS (fast fallback, no voice cloning).""" try: mp3_path = str(output_path).replace(".wav", ".mp3") tts = edge_tts.Communicate(text, voice=self._edge_voice) await tts.save(mp3_path) # Convert to WAV for stream compatibility subprocess.run( ["ffmpeg", "-y", "-i", mp3_path, "-ar", "24000", "-ac", "1", str(output_path)], capture_output=True, timeout=10, ) # Clean up MP3 try: os.unlink(mp3_path) except OSError: pass if output_path.exists(): return True # If ffmpeg conversion failed, use MP3 directly if os.path.exists(mp3_path): os.rename(mp3_path, str(output_path)) return True return False except Exception as e: logger.error(f"Edge TTS error: {e}") return False def _clean_text(self, text: str) -> str: """Clean text for TTS.""" import re # Remove markdown text = re.sub(r'\*\*([^*]+)\*\*', r'\1', text) text = re.sub(r'\*([^*]+)\*', r'\1', text) text = re.sub(r'`([^`]+)`', r'\1', text) text = re.sub(r'#{1,6}\s*', '', text) text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text) # Remove emojis and special chars text = re.sub(r'[^\x00-\x7F]+', ' ', text) text = re.sub(r'[═─│┌┐└┘├┤┬┴┼]', '', text) text = re.sub(r'\s+', ' ', text).strip() return text def _get_cache_path(self, text: str) -> Path: """Get deterministic cache path for text.""" text_hash = hashlib.md5(f"farnsworth:{text}".encode()).hexdigest() return self._cache_dir / f"farnsworth_{text_hash}.wav" def get_audio_duration(self, audio_path: str) -> float: """Get duration of an audio file in seconds.""" try: import soundfile as sf data, sr = sf.read(audio_path) return len(data) / sr except Exception: try: import wave with wave.open(audio_path, 'r') as f: return f.getnframes() / f.getframerate() except Exception: return 5.0 # Default estimate async def cleanup(self): """Release GPU memory.""" self._f5tts_model = None self._f5tts_ready = False self._xtts_model = None self._xtts_ready = False try: import torch if torch.cuda.is_available(): torch.cuda.empty_cache() except ImportError: pass logger.info("VTuberTTS cleaned up")

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/timowhite88/Farnsworth'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

vtuber_tts.py•11.9 KiB