import asyncio
import logging
import tempfile
from typing import Optional

import numpy as np
import soundfile as sf
import torch
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

logger = logging.getLogger(__name__)


class HuggingFaceTTSClient:
    """
    Hugging Face TTS client using Microsoft SpeechT5.
    Uses fixed speaker embeddings to avoid dataset script issues.
    """

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = None
        self.model = None
        self.vocoder = None
        self.speaker_embeddings = None
        self.model_loaded = False
        logger.info(f"HF TTS Client initialized on device: {self.device}")

    async def load_model(self):
        """Load SpeechT5 model and vocoder with fixed speaker embeddings."""
        try:
            logger.info("Loading SpeechT5 TTS model...")

            # Load processor, model and vocoder
            self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(self.device)
            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(self.device)

            # Use a pre-defined speaker embedding instead of loading from a dataset.
            # This avoids the dataset script issue.
            self.speaker_embeddings = self._get_default_speaker_embedding()

            self.model_loaded = True
            logger.info("✅ SpeechT5 TTS model loaded successfully")
            return True
        except Exception as e:
            logger.error(f"❌ Failed to load TTS model: {e}")
            return False

    def _get_default_speaker_embedding(self):
        """Get a default speaker embedding to avoid dataset loading issues."""
        # Create a default speaker embedding vector (512 dimensions, the size SpeechT5 expects).
        embedding = torch.randn(1, 512).to(self.device)
        return embedding

    def _get_speaker_embedding(self, voice_id: Optional[str]):
        """Get a speaker embedding based on voice_id."""
        # Create different embeddings for different voices by seeding the random generator.
        voice_seeds = {
            "21m00Tcm4TlvDq8ikWAM": 42,   # Female voice (default)
            "pNInz6obpgDQGcFmaJgB": 123,  # Male voice
            "EXAVITQu4vr4xnSDxMaL": 456,  # Sweet female
            "ErXwobaYiN019PkySvjV": 789,  # Professional male
            "TxGEqnHWrfWFTfGW9XjX": 101,  # Deep male
            "yoZ06aMxZJJ28mfd3POQ": 202,  # Friendly
            "AZnzlk1XvdvUeBnXmlld": 303,  # Strong female
        }
        seed = voice_seeds.get(voice_id, 42)  # Default to female voice

        # Create a deterministic embedding based on the seed
        generator = torch.Generator(device=self.device)
        generator.manual_seed(seed)
        embedding = torch.randn(1, 512, generator=generator, device=self.device)
        return embedding

    async def text_to_speech(self, text: str, voice_id: Optional[str] = None) -> str:
        """
        Convert text to speech using SpeechT5.

        Args:
            text: Text to convert to speech
            voice_id: Voice identifier (mapped to different speaker embeddings)

        Returns:
            Path to the generated audio file
        """
        if not self.model_loaded:
            logger.info("Model not loaded, loading now...")
            success = await self.load_model()
            if not success:
                raise Exception("Failed to load TTS model")

        try:
            logger.info(f"Generating speech for text: {text[:50]}...")

            # Get the speaker embedding for the requested voice
            speaker_embeddings = self._get_speaker_embedding(voice_id)

            # Process text
            inputs = self.processor(text=text, return_tensors="pt").to(self.device)

            # Generate speech
            with torch.no_grad():
                speech = self.model.generate_speech(
                    inputs["input_ids"],
                    speaker_embeddings,
                    vocoder=self.vocoder,
                )

            # Convert to a NumPy array for writing to disk
            audio_data = speech.cpu().numpy()

            # Save to a temporary WAV file (SpeechT5 outputs 16 kHz audio)
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
            sf.write(temp_file.name, audio_data, samplerate=16000)
            temp_file.close()

            logger.info(f"✅ Generated speech audio: {temp_file.name}")
            return temp_file.name
        except Exception as e:
            logger.error(f"❌ Error generating speech: {e}")
            raise Exception(f"TTS generation failed: {e}")
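

# --- Usage sketch (illustrative only, not part of the client) ---
# A minimal example of how this client might be driven from an asyncio entry
# point. The example text, chosen voice_id, and the _demo() helper are
# assumptions for demonstration, not part of the original module.
async def _demo():
    client = HuggingFaceTTSClient()
    audio_path = await client.text_to_speech(
        "Hello from SpeechT5!",
        voice_id="21m00Tcm4TlvDq8ikWAM",  # maps to the default female voice seed above
    )
    print(f"Audio written to: {audio_path}")


if __name__ == "__main__":
    asyncio.run(_demo())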