# AI_Avatar_Chat / hf_tts_client.py
import torch
import tempfile
import logging
import soundfile as sf
import numpy as np
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import asyncio
from typing import Optional
logger = logging.getLogger(__name__)

class HuggingFaceTTSClient:
    """
    Hugging Face TTS client using Microsoft SpeechT5.
    Fixed to avoid dataset script issues.
    """

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = None
        self.model = None
        self.vocoder = None
        self.speaker_embeddings = None
        self.model_loaded = False
        logger.info(f"HF TTS Client initialized on device: {self.device}")

    async def load_model(self):
        """Load SpeechT5 model and vocoder with fixed speaker embeddings."""
        try:
            logger.info("Loading SpeechT5 TTS model...")

            # Load processor, model and vocoder
            self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
            self.model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(self.device)
            self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(self.device)

            # Use a pre-defined speaker embedding instead of loading one from a
            # dataset; this avoids the dataset script issue.
            self.speaker_embeddings = self._get_default_speaker_embedding()

            self.model_loaded = True
            logger.info("SUCCESS: SpeechT5 TTS model loaded successfully")
            return True
        except Exception as e:
            logger.error(f"ERROR: Failed to load TTS model: {e}")
            return False

    def _get_default_speaker_embedding(self):
        """Get a default speaker embedding to avoid dataset loading issues."""
        # SpeechT5 expects a 512-dimensional speaker (x-vector) embedding.
        # Note: this tensor is unseeded, so the fallback voice differs between
        # runs; text_to_speech() derives reproducible per-voice embeddings via
        # _get_speaker_embedding() instead.
        embedding = torch.randn(1, 512).to(self.device)
        return embedding

    def _get_speaker_embedding(self, voice_id: Optional[str]):
        """Get a speaker embedding based on voice_id."""
        # Map voice IDs to seeds so each voice gets a distinct but
        # deterministic random embedding.
        voice_seeds = {
            "21m00Tcm4TlvDq8ikWAM": 42,   # Female voice (default)
            "pNInz6obpgDQGcFmaJgB": 123,  # Male voice
            "EXAVITQu4vr4xnSDxMaL": 456,  # Sweet female
            "ErXwobaYiN019PkySvjV": 789,  # Professional male
            "TxGEqnHWrfWFTfGW9XjX": 101,  # Deep male
            "yoZ06aMxZJJ28mfd3POQ": 202,  # Friendly
            "AZnzlk1XvdvUeBnXmlld": 303,  # Strong female
        }
        seed = voice_seeds.get(voice_id, 42)  # Default to the female voice

        # Create a deterministic embedding from the seed
        generator = torch.Generator(device=self.device)
        generator.manual_seed(seed)
        embedding = torch.randn(1, 512, generator=generator, device=self.device)
        return embedding

    async def text_to_speech(self, text: str, voice_id: Optional[str] = None) -> str:
        """
        Convert text to speech using SpeechT5.

        Args:
            text: Text to convert to speech
            voice_id: Voice identifier (mapped to different speaker embeddings)

        Returns:
            Path to the generated audio file
        """
        if not self.model_loaded:
            logger.info("Model not loaded, loading now...")
            success = await self.load_model()
            if not success:
                raise Exception("Failed to load TTS model")

        try:
            logger.info(f"Generating speech for text: {text[:50]}...")

            # Get the speaker embedding for the requested voice
            speaker_embeddings = self._get_speaker_embedding(voice_id)

            # Tokenize the input text
            inputs = self.processor(text=text, return_tensors="pt").to(self.device)

            # Generate speech
            with torch.no_grad():
                speech = self.model.generate_speech(
                    inputs["input_ids"],
                    speaker_embeddings,
                    vocoder=self.vocoder
                )

            # Move the waveform to the CPU as a NumPy array for writing
            audio_data = speech.cpu().numpy()

            # Save to a temporary WAV file (SpeechT5 outputs 16 kHz audio)
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
            sf.write(temp_file.name, audio_data, samplerate=16000)
            temp_file.close()

            logger.info(f"SUCCESS: Generated speech audio: {temp_file.name}")
            return temp_file.name
        except Exception as e:
            logger.error(f"ERROR: Error generating speech: {e}")
            raise Exception(f"TTS generation failed: {e}")