Spaces:
Running
Running
| ο»Ώimport torch | |
| import tempfile | |
| import logging | |
| import soundfile as sf | |
| import numpy as np | |
| from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq | |
| import asyncio | |
| from typing import Optional | |
| logger = logging.getLogger(__name__) | |
class MinimalTTSClient:
    """
    Minimal TTS client with basic functionality.

    No speech model is loaded: ``text_to_speech`` currently synthesizes a
    placeholder tone whose length and pitch depend on the length of the
    input text, and writes it to a temporary WAV file.
    """

    # Placeholder synthesis parameters.
    SAMPLE_RATE = 16000      # samples per second of the generated WAV
    SECONDS_PER_CHAR = 0.1   # audio length contributed by each input character
    MIN_DURATION = 0.1       # floor so that empty text still yields audible samples
    MAX_DURATION = 10.0      # cap on the clip length, in seconds

    def __init__(self):
        """Pick the compute device and start in the "not loaded" state."""
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_loaded = False
        logger.info(f"Minimal TTS Client initialized on device: {self.device}")

    async def load_model(self):
        """Mark the placeholder TTS backend as ready.

        Nothing is actually loaded; the method only flips ``model_loaded``.

        Returns:
            True when setup completed, False if it raised.
        """
        try:
            logger.info("Setting up minimal TTS...")
            # For now, we'll create a mock TTS that generates simple audio.
            # This avoids all the complex model loading issues.
            self.model_loaded = True
            logger.info("β Minimal TTS ready")
            return True
        except Exception as e:
            logger.error(f"β Failed to load TTS: {e}")
            return False

    async def text_to_speech(self, text: str, voice_id: Optional[str] = None) -> str:
        """
        Convert text to speech - for now creates a simple placeholder audio file.

        Args:
            text: Input text. Only its length influences the generated audio.
            voice_id: Accepted for interface compatibility; the placeholder
                synthesis does not use it.

        Returns:
            Path of a temporary ``.wav`` file. The file is not deleted
            automatically; the caller owns it.

        Raises:
            Exception: If the backend cannot be set up or the audio cannot be
                generated or written. The underlying error is chained as the
                cause.
        """
        if not self.model_loaded:
            logger.info("TTS not loaded, loading now...")
            success = await self.load_model()
            if not success:
                raise Exception("Failed to load TTS")

        try:
            logger.info(f"Generating minimal audio for text: {text[:50]}...")
            audio_data = self._synthesize_placeholder(text)
            wav_path = self._write_wav(audio_data)
            logger.info(f"β Generated placeholder audio: {wav_path}")
            logger.warning("π’ Using placeholder audio - TTS will be improved in next update")
            return wav_path
        except Exception as e:
            logger.error(f"β Error generating audio: {e}")
            raise Exception(f"Audio generation failed: {e}") from e

    def _synthesize_placeholder(self, text: str) -> np.ndarray:
        """Build the placeholder tone for *text* as a float sample array."""
        # Clip length scales with the text, clamped to [MIN_DURATION, MAX_DURATION].
        # The lower bound only matters for empty text, which would otherwise
        # produce a zero-sample file.
        duration = min(
            max(len(text) * self.SECONDS_PER_CHAR, self.MIN_DURATION),
            self.MAX_DURATION,
        )
        t = np.linspace(0, duration, int(self.SAMPLE_RATE * duration), False)

        # Base tone whose pitch varies slightly with the text length.
        frequency = 440 + (len(text) % 100) * 2
        tone = 0.1 * np.sin(2 * np.pi * frequency * t)

        # Slow 2 Hz amplitude modulation to make it less monotonous.
        return tone * (1 + 0.3 * np.sin(2 * np.pi * 2 * t))

    def _write_wav(self, audio_data) -> str:
        """Write *audio_data* to a new temporary WAV file and return its path."""
        import os  # local import: only needed for cleanup on failure

        # Reserve a unique path and release our handle right away, so the
        # handle is closed even if the write below fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as temp_file:
            wav_path = temp_file.name

        try:
            sf.write(wav_path, audio_data, samplerate=self.SAMPLE_RATE)
        except Exception:
            # delete=False means nothing else will remove a half-written file.
            try:
                os.unlink(wav_path)
            except OSError:
                # Best-effort cleanup; the original write error is what matters.
                pass
            raise
        return wav_path