Farnsworth

Overview Schema Related Servers Score Discussions

neural_avatar.py•13.6 KiB

""" Neural Avatar - MuseTalk/StyleAvatar integration for photorealistic lip sync Provides cutting-edge neural rendering for the VTuber avatar """ import asyncio import numpy as np from dataclasses import dataclass from typing import Dict, List, Optional, Tuple, Any from collections import deque from pathlib import Path import time import os from loguru import logger try: import torch HAS_TORCH = True except ImportError: HAS_TORCH = False try: import cv2 HAS_CV2 = True except ImportError: HAS_CV2 = False @dataclass class NeuralAvatarConfig: """Configuration for neural avatar rendering""" # Model selection model_type: str = "musetalk" # musetalk, sadtalker, styleavatar # Face image face_image_path: Optional[str] = None face_crop_margin: float = 0.2 # Output settings output_width: int = 512 output_height: int = 512 output_fps: int = 30 # Quality settings use_half_precision: bool = True batch_size: int = 1 # MuseTalk specific musetalk_version: str = "v1.5" musetalk_bbox_shift: int = 0 # Device device: str = "cuda" if HAS_TORCH and torch.cuda.is_available() else "cpu" class MuseTalkAvatar: """ MuseTalk integration for real-time lip sync MuseTalk provides: - 30fps+ real-time lip sync - High quality face modification - Multilingual support - Single image to talking avatar """ def __init__(self, config: NeuralAvatarConfig): self.config = config self._model = None self._face_image = None self._face_mask = None self._initialized = False # Caching for performance self._latent_cache: Dict[str, Any] = {} self._max_cache_size = 50 self._last_audio_hash = None logger.info(f"MuseTalkAvatar initialized (device: {config.device})") def _cache_latent(self, key: str, value: Any): """Store a latent in the bounded cache, evicting oldest if full""" self._latent_cache[key] = value if len(self._latent_cache) > self._max_cache_size: # Remove oldest entry oldest = next(iter(self._latent_cache)) del self._latent_cache[oldest] async def initialize(self, face_image_path: Optional[str] = None) -> bool: """Initialize MuseTalk model and prepare face image""" if not HAS_TORCH: logger.error("PyTorch not available for neural avatar") return False if not HAS_CV2: logger.error("OpenCV not available for neural avatar") return False try: # Load face image image_path = face_image_path or self.config.face_image_path if not image_path or not os.path.exists(image_path): logger.error(f"Face image not found: {image_path}") return False self._face_image = cv2.imread(image_path) if self._face_image is None: logger.error(f"Failed to load face image: {image_path}") return False # Resize if needed self._face_image = cv2.resize( self._face_image, (self.config.output_width, self.config.output_height) ) # Try to load MuseTalk try: await self._load_musetalk_model() except Exception as e: logger.warning(f"MuseTalk model load failed: {e}") logger.info("Using simplified neural lip sync fallback") self._initialized = True logger.info("Neural avatar initialized successfully") return True except Exception as e: logger.error(f"Neural avatar initialization failed: {e}") return False async def _load_musetalk_model(self): """Load MuseTalk model weights""" # MuseTalk model loading # Note: Full implementation requires MuseTalk repository # https://github.com/TMElyralab/MuseTalk try: # Check if MuseTalk is installed from musetalk.models.musetalk import MuseTalk as MuseTalkModel from musetalk.utils.face_parsing import FaceParsing from musetalk.utils.audio import Audio2Feature # Initialize models self._audio_processor = Audio2Feature(device=self.config.device) self._face_parser = FaceParsing(device=self.config.device) self._model = MuseTalkModel.from_pretrained( "TMElyralab/MuseTalk", variant=self.config.musetalk_version ) self._model.to(self.config.device) if self.config.use_half_precision: self._model.half() self._model.eval() logger.info("MuseTalk model loaded successfully") except ImportError: logger.warning("MuseTalk not installed - using fallback") self._model = None async def generate_frame(self, audio_chunk: np.ndarray, sample_rate: int = 16000) -> Optional[np.ndarray]: """Generate a single frame with lip sync from audio chunk""" if not self._initialized or self._face_image is None: return None if self._model is not None: return await self._generate_musetalk_frame(audio_chunk, sample_rate) else: return await self._generate_fallback_frame(audio_chunk, sample_rate) async def _generate_musetalk_frame(self, audio_chunk: np.ndarray, sample_rate: int) -> Optional[np.ndarray]: """Generate frame using MuseTalk""" try: with torch.no_grad(): # Process audio to features audio_tensor = torch.from_numpy(audio_chunk).float() audio_features = self._audio_processor(audio_tensor, sample_rate) # Get face crop and mask if self._face_mask is None: face_tensor = torch.from_numpy(self._face_image).permute(2, 0, 1).float() self._face_mask = self._face_parser(face_tensor.unsqueeze(0)) # Generate lip sync face_tensor = torch.from_numpy(self._face_image).permute(2, 0, 1).float() face_tensor = face_tensor.unsqueeze(0).to(self.config.device) if self.config.use_half_precision: face_tensor = face_tensor.half() audio_features = audio_features.half() # Forward pass output = self._model( face_tensor, audio_features, self._face_mask ) # Convert to numpy frame = output.squeeze(0).permute(1, 2, 0).cpu().numpy() frame = (frame * 255).astype(np.uint8) return frame except Exception as e: logger.error(f"MuseTalk generation failed: {e}") return await self._generate_fallback_frame(audio_chunk, sample_rate) async def _generate_fallback_frame(self, audio_chunk: np.ndarray, sample_rate: int) -> Optional[np.ndarray]: """Fallback frame generation with simple mouth animation""" if self._face_image is None: return None frame = self._face_image.copy() # Calculate audio intensity rms = np.sqrt(np.mean(audio_chunk.astype(float) ** 2)) intensity = min(rms / 5000, 1.0) if len(audio_chunk) > 0 else 0.0 # Simple mouth overlay based on intensity if intensity > 0.1: # Draw mouth opening (simplified) center_x = self.config.output_width // 2 center_y = int(self.config.output_height * 0.7) mouth_height = int(10 + intensity * 30) cv2.ellipse( frame, (center_x, center_y), (40, mouth_height), 0, 0, 360, (50, 30, 30), -1 ) return frame async def generate_video_segment(self, audio: np.ndarray, sample_rate: int = 16000, chunk_size: int = 1600) -> List[np.ndarray]: """Generate multiple frames for an audio segment""" frames = [] num_chunks = len(audio) // chunk_size for i in range(num_chunks): start = i * chunk_size end = start + chunk_size chunk = audio[start:end] frame = await self.generate_frame(chunk, sample_rate) if frame is not None: frames.append(frame) return frames def get_latent_representation(self) -> Optional[np.ndarray]: """Get latent representation of face for caching""" if self._model is None or self._face_image is None: return None try: with torch.no_grad(): face_tensor = torch.from_numpy(self._face_image).permute(2, 0, 1).float() face_tensor = face_tensor.unsqueeze(0).to(self.config.device) # Get encoder output latent = self._model.encode(face_tensor) return latent.cpu().numpy() except Exception as e: logger.error(f"Failed to get latent: {e}") return None async def cleanup(self): """Release GPU memory""" if self._model is not None: del self._model self._model = None if HAS_TORCH: torch.cuda.empty_cache() self._face_image = None self._face_mask = None self._latent_cache.clear() self._initialized = False class SadTalkerAvatar: """ SadTalker integration for 3D-aware lip sync SadTalker provides: - 3DMM-based facial motion - Audio-driven 3D motion coefficients - Single image to video """ def __init__(self, config: NeuralAvatarConfig): self.config = config self._model = None self._face_image = None self._initialized = False logger.info("SadTalkerAvatar initialized") async def initialize(self, face_image_path: Optional[str] = None) -> bool: """Initialize SadTalker""" # SadTalker implementation # https://github.com/OpenTalker/SadTalker try: from sadtalker.src.facerender.animate import AnimateFromCoeff from sadtalker.src.generate_batch import get_data from sadtalker.src.generate_facerender_batch import get_facerender_data # Load models... logger.info("SadTalker loaded") self._initialized = True return True except ImportError: logger.warning("SadTalker not installed") return False async def generate_frame(self, audio_chunk: np.ndarray, sample_rate: int = 16000) -> Optional[np.ndarray]: """Generate frame using SadTalker""" # SadTalker is typically used for batch video generation # Real-time use requires specific optimization return None class NeuralAvatarManager: """ Manager for neural avatar backends Handles: - Backend selection and fallback - Performance optimization - Frame caching """ def __init__(self, config: NeuralAvatarConfig): self.config = config self._active_backend: Optional[Any] = None self._backend_type: str = "" # Frame buffer for smooth playback self._frame_buffer: deque = deque(maxlen=10) self._buffer_size = 10 async def initialize(self, face_image_path: str) -> bool: """Initialize the best available neural avatar backend""" # Try backends in order of preference backends = [ ("musetalk", MuseTalkAvatar), ("sadtalker", SadTalkerAvatar), ] for backend_name, backend_class in backends: if self.config.model_type and self.config.model_type != backend_name: continue try: backend = backend_class(self.config) if await backend.initialize(face_image_path): self._active_backend = backend self._backend_type = backend_name logger.info(f"Using neural avatar backend: {backend_name}") return True except Exception as e: logger.warning(f"Backend {backend_name} failed: {e}") logger.error("No neural avatar backend available") return False async def generate_frame(self, audio_chunk: np.ndarray, sample_rate: int = 16000) -> Optional[np.ndarray]: """Generate a frame using the active backend""" if self._active_backend is None: return None frame = await self._active_backend.generate_frame(audio_chunk, sample_rate) # Add to buffer (deque handles maxlen automatically) if frame is not None: self._frame_buffer.append(frame) return frame def get_buffered_frame(self, index: int = -1) -> Optional[np.ndarray]: """Get a frame from the buffer""" if not self._frame_buffer: return None return self._frame_buffer[index] @property def backend_type(self) -> str: return self._backend_type @property def is_initialized(self) -> bool: return self._active_backend is not None async def cleanup(self): """Clean up the active backend""" if self._active_backend and hasattr(self._active_backend, 'cleanup'): await self._active_backend.cleanup() self._active_backend = None self._frame_buffer.clear()

Loading blob content...

Latest Blog Posts

Redis vs ioredis vs valkey-glide
By punkpeye on January 26, 2026.
benchmark
Redis
valkey
Quickstart: Publish an MCP Server to the MCP Registry
By punkpeye on January 24, 2026.
mcp
official reference mirror
Official MCP Registry Server.json Requirements
By punkpeye on January 24, 2026.
mcp
official reference mirror

MCP directory API

We provide all the information about MCP servers via our MCP API.

curl -X GET 'https://glama.ai/api/mcp/v1/servers/timowhite88/Farnsworth'

If you have feedback or need assistance with the MCP directory API, please join our Discord server

neural_avatar.py•13.6 KiB