Spaces:
Running
Running
| ο»Ώimport torch | |
| import tempfile | |
| import logging | |
| import soundfile as sf | |
| import numpy as np | |
| import asyncio | |
| from typing import Optional | |
| logger = logging.getLogger(__name__) | |
class RobustTTSClient:
    """
    Placeholder TTS client that renders text as a synthetic tone, not speech.

    No speech model is involved. ``text_to_speech`` writes a WAV file that
    contains a tone whose length follows the length of the text and whose
    pitch follows the requested voice id, and returns the path of that file.
    """

    _SAMPLE_RATE = 22050  # Hz; used for every generated clip
    _BASE_FREQ = 440  # Hz; A4
    _FALLBACK_DURATION = 2.0  # seconds of plain beep when generation fails

    # Pitch multiplier applied to the base frequency for each known voice id.
    # Unknown ids (and None) use 1.0.
    _VOICE_MULTIPLIERS = {
        "21m00Tcm4TlvDq8ikWAM": 1.0,  # Female (higher)
        "pNInz6obpgDQGcFmaJgB": 0.75,  # Male (lower)
        "EXAVITQu4vr4xnSDxMaL": 1.1,  # Sweet female
        "ErXwobaYiN019PkySvjV": 0.8,  # Professional male
        "TxGEqnHWrfWFTfGW9XjX": 0.65,  # Deep male
        "yoZ06aMxZJJ28mfd3POQ": 0.9,  # Friendly
        "AZnzlk1XvdvUeBnXmlld": 1.05,  # Strong female
    }

    def __init__(self) -> None:
        """Record the available torch device; no model is loaded here."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_loaded = False
        logger.info(f"Robust TTS Client initialized on device: {self.device}")

    async def load_model(self) -> bool:
        """
        Mark the client as ready. There is no actual model to load.

        Returns:
            Always ``True``.
        """
        try:
            logger.info("Setting up robust placeholder TTS...")
            self.model_loaded = True
            logger.info("β Robust TTS ready (placeholder audio mode)")
            return True
        except Exception as e:
            logger.error(f"β Unexpected error in TTS setup: {e}")
            # Tone generation does not depend on any setup step, so the
            # client is still usable even if something above went wrong.
            self.model_loaded = True
            return True

    @staticmethod
    def _fallback_beep(sample_rate: int = 22050, duration: float = 2.0):
        """Return ``(samples, sample_rate)`` for a plain 440 Hz sine beep."""
        t = np.linspace(0, duration, int(sample_rate * duration), False)
        return 0.3 * np.sin(2 * np.pi * 440 * t), sample_rate

    @staticmethod
    def _write_wav(audio_data, sample_rate: int) -> str:
        """Write samples to a fresh temporary ``.wav`` file and return its path."""
        import os  # local import: the module's import block does not provide os

        # Reserve a unique path and release the handle straight away, so the
        # audio writer is the only thing that has the file open.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as handle:
            path = handle.name
        try:
            sf.write(path, audio_data, samplerate=sample_rate)
        except Exception:
            # Do not leave an empty or partial file behind on failure.
            try:
                os.remove(path)
            except OSError:
                pass
            raise
        return path

    def generate_tone_audio(
        self, text: str, voice_id: Optional[str] = None
    ) -> "tuple[np.ndarray, int]":
        """
        Build the placeholder tone for ``text``.

        The clip lasts 0.08 s per character, clamped to the range 2-15 s. It
        is a 440 Hz tone scaled by the voice multiplier, with its 2nd and 3rd
        harmonics and a low-level component whose frequency (50-249 Hz) is
        derived from the lower-cased text. The result has a 0.1 s fade in and
        fade out and is normalised to a peak of 1.0.

        Args:
            text: Text the clip stands in for.
            voice_id: Optional voice identifier; unknown ids use the default pitch.

        Returns:
            ``(samples, sample_rate)`` where ``samples`` is a 1-D float array.
            If anything fails, the error is logged and a plain 2-second
            440 Hz beep is returned instead.
        """
        import zlib  # local import: the module's import block does not provide zlib

        sample_rate = self._SAMPLE_RATE
        try:
            # 0.08 s per character, at least 2 s and at most 15 s.
            duration = max(2.0, min(len(text) * 0.08, 15.0))
            t = np.linspace(0, duration, int(sample_rate * duration), False)

            frequency = self._BASE_FREQ * self._VOICE_MULTIPLIERS.get(voice_id, 1.0)

            # Fundamental plus two harmonics for a less sterile sound.
            audio_data = 0.3 * np.sin(2 * np.pi * frequency * t)
            audio_data += 0.15 * np.sin(2 * np.pi * frequency * 2 * t)  # octave
            audio_data += 0.1 * np.sin(2 * np.pi * frequency * 3 * t)  # octave + fifth

            # Text-dependent component. The built-in hash() of a str is salted
            # per process, so a CRC is used to get the same tone for the same
            # text on every run.
            text_hash = zlib.crc32(text.lower().encode("utf-8", "surrogatepass")) % 1000
            variation_freq = 50 + (text_hash % 200)  # 50-249 Hz
            audio_data += 0.05 * np.sin(2 * np.pi * variation_freq * t)

            # Linear fade in/out over 0.1 s to avoid clicks at the edges.
            fade_samples = int(0.1 * sample_rate)
            if len(audio_data) > 2 * fade_samples:
                audio_data[:fade_samples] *= np.linspace(0, 1, fade_samples)
                audio_data[-fade_samples:] *= np.linspace(1, 0, fade_samples)

            # Normalise to a peak of 1.0; skip if the signal is all zeros.
            peak = np.max(np.abs(audio_data))
            if peak > 0:
                audio_data = audio_data / peak
            return audio_data, sample_rate
        except Exception as e:
            logger.error(f"Error in tone generation: {e}")
            # Fallback to simple beep
            return self._fallback_beep(sample_rate, self._FALLBACK_DURATION)

    async def text_to_speech(self, text: str, voice_id: Optional[str] = None) -> str:
        """
        Render ``text`` as placeholder audio and return the WAV file path.

        The file is created with ``delete=False``; this method does not
        remove it after a successful write.

        Args:
            text: Text to render.
            voice_id: Optional voice identifier selecting the tone's pitch.

        Returns:
            Path of the generated temporary ``.wav`` file.

        Raises:
            RuntimeError: If both the normal path and the fallback beep fail
                to produce a file.
        """
        if not self.model_loaded:
            logger.info("TTS not loaded, loading now...")
            success = await self.load_model()
            if not success:
                logger.error("TTS loading failed, but continuing with basic audio")

        try:
            logger.info(f"Generating audio for text: {text[:50]}...")
            logger.info(f"Using voice profile: {voice_id or 'default'}")

            audio_data, sample_rate = self.generate_tone_audio(text, voice_id)
            path = self._write_wav(audio_data, sample_rate)

            logger.info(f"β Generated audio file: {path}")
            logger.info(f"π Audio details: {len(audio_data)/sample_rate:.1f}s, {sample_rate}Hz")
            logger.warning("π Using placeholder audio - Real TTS coming in future update")
            return path
        except Exception as e:
            logger.error(f"β Critical error in audio generation: {str(e)}")
            logger.error(f"Exception type: {type(e).__name__}")

            # Last resort: write a minimal 2-second sine wave.
            try:
                audio_data, sample_rate = self._fallback_beep(
                    self._SAMPLE_RATE, self._FALLBACK_DURATION
                )
                path = self._write_wav(audio_data, sample_rate)
                logger.info(f"β Created fallback audio: {path}")
                return path
            except Exception as final_error:
                logger.error(f"β Even fallback audio failed: {final_error}")
                raise RuntimeError(f"Complete TTS failure: {final_error}") from final_error