Spaces:
				
			
			
	
			
			
					
		Running
		
	
	
	
			
			
	
	
	
	
		
		
					
		Running
		
	| ο»Ώimport torch | |
| import tempfile | |
| import logging | |
| import soundfile as sf | |
| import numpy as np | |
| from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
| import asyncio | |
| from typing import Optional | |
logger = logging.getLogger(__name__)


class HuggingFaceTTSClient:
    """Text-to-speech client built on Microsoft SpeechT5 via Hugging Face.

    The client does not load speaker x-vectors from a dataset. Instead it
    feeds the model seeded random vectors, one seed per known ``voice_id``,
    so the same voice id always maps to the same embedding on a given device.

    NOTE(review): the embeddings are plain ``torch.randn`` samples, not real
    speaker vectors; the per-voice labels below are carried over unchanged and
    presumably do not describe the resulting timbre -- confirm by listening.
    """

    # Hub identifiers of the acoustic model/processor and of the vocoder.
    TTS_MODEL_ID = "microsoft/speecht5_tts"
    VOCODER_MODEL_ID = "microsoft/speecht5_hifigan"

    # Sample rate passed to soundfile when the waveform is written to disk.
    SAMPLE_RATE = 16000

    # Width of the speaker-embedding vector handed to ``generate_speech``.
    EMBEDDING_DIM = 512

    # Seed used for ``voice_id=None`` and for any id missing from VOICE_SEEDS.
    DEFAULT_VOICE_SEED = 42

    # voice_id -> RNG seed. Defined once here instead of on every call.
    VOICE_SEEDS = {
        "21m00Tcm4TlvDq8ikWAM": 42,   # Female voice (default)
        "pNInz6obpgDQGcFmaJgB": 123,  # Male voice
        "EXAVITQu4vr4xnSDxMaL": 456,  # Sweet female
        "ErXwobaYiN019PkySvjV": 789,  # Professional male
        "TxGEqnHWrfWFTfGW9XjX": 101,  # Deep male
        "yoZ06aMxZJJ28mfd3POQ": 202,  # Friendly
        "AZnzlk1XvdvUeBnXmlld": 303,  # Strong female
    }

    def __init__(self):
        """Pick the compute device; the model itself is loaded lazily."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = None
        self.model = None
        self.vocoder = None
        self.speaker_embeddings = None
        self.model_loaded = False
        logger.info(f"HF TTS Client initialized on device: {self.device}")

    async def load_model(self) -> bool:
        """Load the SpeechT5 processor, model and vocoder.

        Returns:
            True when everything loaded, False otherwise (the failure is
            logged with its traceback and not re-raised).

        NOTE(review): the body contains no ``await``, so the ``from_pretrained``
        calls run inline in the calling coroutine -- confirm that blocking the
        event loop for the duration of the load is acceptable.
        """
        try:
            logger.info("Loading SpeechT5 TTS model...")

            # Load into locals first so a failure part-way through does not
            # leave the instance with a mix of new and missing components.
            processor = SpeechT5Processor.from_pretrained(self.TTS_MODEL_ID)
            model = SpeechT5ForTextToSpeech.from_pretrained(self.TTS_MODEL_ID).to(self.device)
            vocoder = SpeechT5HifiGan.from_pretrained(self.VOCODER_MODEL_ID).to(self.device)

            # Use a pre-defined speaker embedding instead of loading one from
            # a dataset (avoids the dataset-script issue).
            speaker_embeddings = self._get_default_speaker_embedding()
        except Exception as e:
            logger.error(f"β Failed to load TTS model: {e}", exc_info=True)
            return False

        self.processor = processor
        self.model = model
        self.vocoder = vocoder
        self.speaker_embeddings = speaker_embeddings
        self.model_loaded = True
        logger.info("β SpeechT5 TTS model loaded successfully")
        return True

    def _get_default_speaker_embedding(self) -> torch.Tensor:
        """Return the embedding of the default voice.

        Delegates to the seeded path so the default is reproducible and is the
        same tensor ``text_to_speech`` uses when no ``voice_id`` is given.
        """
        return self._get_speaker_embedding(None)

    def _get_speaker_embedding(self, voice_id: Optional[str]) -> torch.Tensor:
        """Return the deterministic embedding for ``voice_id``.

        Args:
            voice_id: Key into ``VOICE_SEEDS``; ``None`` or an unknown id falls
                back to ``DEFAULT_VOICE_SEED``.

        Returns:
            A ``(1, EMBEDDING_DIM)`` tensor on ``self.device``.
        """
        seed = self.VOICE_SEEDS.get(voice_id, self.DEFAULT_VOICE_SEED)

        # A private generator keeps the draw reproducible without touching
        # torch's global RNG state.
        generator = torch.Generator(device=self.device)
        generator.manual_seed(seed)
        return torch.randn(
            1, self.EMBEDDING_DIM, generator=generator, device=self.device
        )

    @staticmethod
    def _discard_file(path: str) -> None:
        """Best-effort removal of a partially written output file."""
        import os  # local import: only needed on the failure path

        try:
            os.remove(path)
        except OSError:
            # Cleanup must never mask the error that triggered it.
            pass

    async def text_to_speech(self, text: str, voice_id: Optional[str] = None) -> str:
        """Convert text to speech using SpeechT5.

        The model is loaded on first use if ``load_model`` was not called.

        Args:
            text: Text to convert to speech; must contain non-whitespace.
            voice_id: Voice identifier (mapped to a seeded speaker embedding).

        Returns:
            Path to the generated ``.wav`` file. The file is created with
            ``delete=False`` and is not removed here on success, so the
            caller owns its cleanup.

        Raises:
            ValueError: ``text`` is empty or whitespace only.
            Exception: the model could not be loaded, or generation/writing
                failed (the original error is attached as ``__cause__``).
        """
        # Reject unusable input before paying for a model load.
        if not text or not text.strip():
            raise ValueError("text must be a non-empty string")

        if not self.model_loaded:
            logger.info("Model not loaded, loading now...")
            if not await self.load_model():
                raise Exception("Failed to load TTS model")

        audio_path = None
        try:
            logger.info(f"Generating speech for text: {text[:50]}...")

            # Speaker embedding for the requested voice.
            speaker_embeddings = self._get_speaker_embedding(voice_id)

            # Tokenise the text and move the tensors to the model's device.
            inputs = self.processor(text=text, return_tensors="pt").to(self.device)

            # Inference only: no gradients needed.
            with torch.no_grad():
                speech = self.model.generate_speech(
                    inputs["input_ids"],
                    speaker_embeddings,
                    vocoder=self.vocoder,
                )

            audio_data = speech.cpu().numpy()

            # Reserve a unique path and close our handle *before* soundfile
            # opens it, so no descriptor is leaked if the write fails and the
            # same path is never held open twice.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
                audio_path = tmp.name
            sf.write(audio_path, audio_data, samplerate=self.SAMPLE_RATE)

            logger.info(f"β Generated speech audio: {audio_path}")
            return audio_path
        except Exception as e:
            logger.error(f"β Error generating speech: {e}")
            if audio_path is not None:
                # Do not leave an empty or truncated wav behind.
                self._discard_file(audio_path)
            raise Exception(f"TTS generation failed: {e}") from e
