# Extracted from a Hugging Face Spaces file view (Space status: Running).
# File size: 4,701 bytes; commit 8be8b4b.
import asyncio
import logging
import os
import tempfile
from typing import Optional

import numpy as np
import soundfile as sf
import torch
from datasets import load_dataset
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

logger = logging.getLogger(__name__)
class HuggingFaceTTSClient:
    """
    Hugging Face TTS client using Microsoft SpeechT5.

    Replaces ElevenLabs with free, open-source TTS. The model, vocoder and
    speaker-embedding dataset are loaded lazily on the first call to
    ``text_to_speech`` (or explicitly via ``load_model``) and then reused.

    NOTE: the model-loading and inference calls are made synchronously inside
    the ``async`` methods, so they occupy the event loop while they run.
    """

    # Checkpoints / dataset used by this client.
    _TTS_CHECKPOINT = "microsoft/speecht5_tts"
    _VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"
    _EMBEDDINGS_DATASET = "Matthijs/cmu-arctic-xvectors"
    _EMBEDDINGS_SPLIT = "validation"

    # Sample rate written into the generated WAV file.
    SAMPLE_RATE = 16000

    # Row of the embeddings dataset used when no (or an unknown) voice is given.
    DEFAULT_SPEAKER_INDEX = 7306

    # Map ElevenLabs voice IDs to speaker indices for compatibility.
    _VOICE_MAPPING = {
        "21m00Tcm4TlvDq8ikWAM": 7306,  # Female voice (default)
        "pNInz6obpgDQGcFmaJgB": 4077,  # Male voice
        "EXAVITQu4vr4xnSDxMaL": 1995,  # Female voice (sweet)
        "ErXwobaYiN019PkySvjV": 8051,  # Male voice (professional)
        "TxGEqnHWrfWFTfGW9XjX": 5688,  # Deep male voice
        "yoZ06aMxZJJ28mfd3POQ": 3570,  # Friendly voice
        "AZnzlk1XvdvUeBnXmlld": 2967,  # Strong female
    }

    def __init__(self):
        """Pick the compute device; nothing is downloaded or loaded yet."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = None
        self.model = None
        self.vocoder = None
        self.speaker_embeddings = None
        self.model_loaded = False
        # Loaded once and reused, instead of calling load_dataset per request.
        self._embeddings_dataset = None
        # speaker index -> embedding tensor already moved to self.device
        self._speaker_cache = {}
        logger.info(f"HF TTS Client initialized on device: {self.device}")

    async def load_model(self) -> bool:
        """
        Load the SpeechT5 processor, model, vocoder and default speaker embedding.

        Returns:
            True when everything loaded, False otherwise (the error is logged,
            not raised).
        """
        try:
            logger.info("Loading SpeechT5 TTS model...")
            # Load processor, model and vocoder
            self.processor = SpeechT5Processor.from_pretrained(self._TTS_CHECKPOINT)
            self.model = SpeechT5ForTextToSpeech.from_pretrained(
                self._TTS_CHECKPOINT
            ).to(self.device)
            self.vocoder = SpeechT5HifiGan.from_pretrained(
                self._VOCODER_CHECKPOINT
            ).to(self.device)
            # Load speaker embeddings dataset and the default voice
            self.speaker_embeddings = self._speaker_embedding(
                self.DEFAULT_SPEAKER_INDEX
            )
            self.model_loaded = True
            logger.info("✅ SpeechT5 TTS model loaded successfully")
            return True
        except Exception as e:
            # Deliberate catch-all: callers rely on the boolean result.
            logger.error(f"❌ Failed to load TTS model: {e}")
            return False

    async def text_to_speech(self, text: str, voice_id: Optional[str] = None) -> str:
        """
        Convert text to speech using SpeechT5.

        Args:
            text: Text to convert to speech
            voice_id: Voice identifier (for compatibility, maps to speaker embeddings)

        Returns:
            Path to generated audio file. The file is a temporary ``.wav`` that
            is not deleted automatically; the caller owns it.

        Raises:
            Exception: if the model cannot be loaded or generation fails. The
                original error is attached as ``__cause__``.
        """
        if not self.model_loaded:
            logger.info("Model not loaded, loading now...")
            if not await self.load_model():
                raise Exception("Failed to load TTS model")

        try:
            logger.info(f"Generating speech for text: {text[:50]}...")
            # Choose speaker embedding based on voice_id (for variety)
            speaker_embeddings = self._speaker_embedding(
                self._get_speaker_index(voice_id)
            )
            # Process text
            inputs = self.processor(text=text, return_tensors="pt").to(self.device)
            # Generate speech
            with torch.no_grad():
                speech = self.model.generate_speech(
                    inputs["input_ids"],
                    speaker_embeddings,
                    vocoder=self.vocoder,
                )
            # Convert to audio file
            audio_path = self._write_wav(speech.cpu().numpy())
            logger.info(f"✅ Generated speech audio: {audio_path}")
            return audio_path
        except Exception as e:
            logger.error(f"❌ Error generating speech: {e}")
            raise Exception(f"TTS generation failed: {e}") from e

    def _speaker_embedding(self, speaker_idx: int):
        """Return the embedding tensor for a dataset row, loading and caching as needed."""
        if self._embeddings_dataset is None:
            self._embeddings_dataset = load_dataset(
                self._EMBEDDINGS_DATASET, split=self._EMBEDDINGS_SPLIT
            )
        dataset = self._embeddings_dataset

        # NOTE(review): some mapped indices may exceed the split's row count —
        # confirm against the dataset. Fall back to the default voice rather
        # than failing every request for that voice.
        if not 0 <= speaker_idx < len(dataset):
            logger.warning(
                f"Speaker index {speaker_idx} out of range, "
                f"using default {self.DEFAULT_SPEAKER_INDEX}"
            )
            speaker_idx = self.DEFAULT_SPEAKER_INDEX

        embedding = self._speaker_cache.get(speaker_idx)
        if embedding is None:
            xvector = dataset[speaker_idx]["xvector"]
            embedding = torch.tensor(xvector).unsqueeze(0).to(self.device)
            self._speaker_cache[speaker_idx] = embedding
        return embedding

    def _write_wav(self, audio_data) -> str:
        """Write audio samples to a new temporary .wav file and return its path."""
        # Reserve a unique path, then close the handle before soundfile opens
        # the file by name: an open NamedTemporaryFile cannot be reopened by
        # name on Windows.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
            audio_path = temp_file.name
        try:
            sf.write(audio_path, audio_data, samplerate=self.SAMPLE_RATE)
        except Exception:
            # Do not leave an empty/partial file behind when writing fails.
            try:
                os.remove(audio_path)
            except OSError:
                pass
            raise
        return audio_path

    def _get_speaker_index(self, voice_id: Optional[str]) -> int:
        """Map voice_id to speaker embedding index for voice variety.

        Unknown or missing voice IDs map to ``DEFAULT_SPEAKER_INDEX``.
        """
        return self._VOICE_MAPPING.get(voice_id, self.DEFAULT_SPEAKER_INDEX)
|