Spaces:

NihalGazi
/

Text-To-Speech-Unlimited

Running

App Files Community

diego2554 committed on Sep 21

Commit

1a3d72c

verified ·

1 Parent(s): 9c33716

Update app.py

Browse files

Enhancements added:

1. **Emotion Categories & Submenus**: Emotions are now organized into "Common" and "Complex" categories for easier selection.
2. **Dropdown + Textbox Sync**: Selecting an emotion from the dropdown automatically updates the emotion textbox; users can still type custom emotions.
3. **Temporary File Cleanup**: Paths of generated audio files are recorded, and the files are deleted when the app process exits (via `atexit`).
4. **Retry Logic**: Audio generation makes up to 3 attempts (waiting 1 second between them) when a network/API request fails.
5. **NSFW Check (Optional)**: The `check_nsfw` function is included, but its call is commented out (`is_nsfw` is hard-coded to `False`); uncomment the call to enable the check.
6. **Improved UX**: More intuitive and robust interface for Hugging Face Spaces with categories and pre-defined complex emotions.

Files changed (1) hide show

app.py +91 -123

app.py CHANGED Viewed

@@ -5,164 +5,145 @@ import random
 import urllib.parse
 import tempfile
 import os
 NSFW_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
 TTS_URL_TEMPLATE = os.getenv("TTS_API_URL_TEMPLATE")
 if not NSFW_URL_TEMPLATE:
     raise ValueError("Missing Secret: NSFW_API_URL_TEMPLATE is not set in Hugging Face Space secrets.")
 if not TTS_URL_TEMPLATE:
     raise ValueError("Missing Secret: TTS_API_URL_TEMPLATE is not set in Hugging Face Space secrets.")
 # VOICES
 VOICES = [
-    "alloy", "echo", "fable", "onyx", "nova", "shimmer",  # Standard OpenAI Voices
-    "coral", "verse", "ballad", "ash", "sage", "amuch", "dan" # Some additional pre-trained
 ]
 def check_nsfw(prompt: str) -> bool:
-    global NSFW_URL_TEMPLATE
     try:
         encoded_prompt = urllib.parse.quote(prompt)
         url = NSFW_URL_TEMPLATE.format(prompt=encoded_prompt)
-        print(f"DEBUG: Checking NSFW URL: {url.split('?')[0]}... (query params hidden)")
         response = requests.get(url, timeout=20)
         response.raise_for_status()
         result = response.text.strip().upper()
-        print(f"DEBUG: NSFW Check Response: '{result}'")
-        if result == "YES":
-            return True
-        elif result == "NO":
-            return False
-        else:
-            print(f"Warning: Unexpected response from NSFW checker: {response.text}")
-            return True # unexpected responses = potentially NSFW
     except requests.exceptions.RequestException as e:
         print(f"Error during NSFW check: {e}")
-        raise gr.Error(f"Failed to check prompt safety.")
     except Exception as e:
         print(f"Unexpected error during NSFW check: {e}")
-        raise gr.Error(f"An unexpected error occurred during safety check. Please wait for a second and try again.")
 def generate_audio(prompt: str, voice: str, emotion: str, seed: int) -> bytes:
-   # Generates audio using the API from server
-    global TTS_URL_TEMPLATE
-    try:
-        encoded_prompt = urllib.parse.quote(prompt)
-        encoded_emotion = urllib.parse.quote(emotion)
-        url = TTS_URL_TEMPLATE.format(
-            prompt=encoded_prompt,
-            emotion=encoded_emotion,
-            voice=voice,
-            seed=seed
-        )
-        print(f"DEBUG: Generating Audio URL: {url.split('?')[0]}... (query params hidden)")
-        response = requests.get(url, timeout=60)
-        response.raise_for_status()
-        content_type = response.headers.get('content-type', '').lower()
-        if 'audio' not in content_type:
-            print(f"Warning: Unexpected content type received: {content_type}")
-            print(f"Response Text: {response.text[:500]}")
-            raise gr.Error(f"API did not return audio.")
-        return response.content
-    except requests.exceptions.RequestException as e:
-        print(f"Error during audio generation: {e}")
-        error_details = ""
-        if hasattr(e, 'response') and e.response is not None:
-            error_details = e.response.text[:200]
-        raise gr.Error(f"Failed to generate audio. Please wait for a second and try again.")
-    except Exception as e:
-        print(f"Unexpected error during audio generation: {e}")
-        raise gr.Error(f"An unexpected error occurred during audio generation. Please wait for a second and try again.")
 def text_to_speech_app(prompt: str, voice: str, emotion: str, use_random_seed: bool, specific_seed: int):
-    print("\n\n\n"+prompt+"\n\n\n")
     if not prompt:
         raise gr.Error("Prompt cannot be empty.")
     if not emotion:
         emotion = "neutral"
-        print("Warning: No emotion provided, defaulting to 'neutral'.")
     if not voice:
-         raise gr.Error("Please select a voice.")
     seed = random.randint(0, 2**32 - 1) if use_random_seed else int(specific_seed)
-    print(f"Using Seed: {seed}")
-    # check NSFW
-    print("Checking prompt safety...")
-    try:
-        # is_nsfw = check_nsfw(prompt)
-        is_nsfw = False
-    except gr.Error as e:
-        return None, f"There was an error. Please wait for a second and try again."
     if is_nsfw:
-        print("Prompt flagged as inappropriate.")
         return None, "Error: The prompt was flagged as inappropriate and cannot be processed."
-    # if not nsfw
-    print("Prompt is safe. Generating audio...")
     try:
         audio_bytes = generate_audio(prompt, voice, emotion, seed)
-        # audio save to a temporary file
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio_file:
             temp_audio_file.write(audio_bytes)
             temp_file_path = temp_audio_file.name
-            print(f"Audio saved temporarily to: {temp_file_path}")
         return temp_file_path, f"Audio generated successfully with voice '{voice}', emotion '{emotion}', and seed {seed}."
     except gr.Error as e:
-         return None, str(e)
     except Exception as e:
-        print(f"Unexpected error in main function: {e}")
-        return None, f"An unexpected error occurred: {e}"
 def toggle_seed_input(use_random_seed):
     return gr.update(visible=not use_random_seed, value=12345)
 with gr.Blocks() as app:
     gr.Markdown("# Advanced OpenAI Text-To-Speech Unlimited")
-    gr.Markdown(
-        """Enter text, choose a voice and emotion, and generate audio.
-        The text will be checked for appropriateness before generation.
-        Use it as much as you want.
-        **Like & follow** for more AI projects:
-        • Instagram: [@nihal_gazi_io](https://www.instagram.com/nihal_gazi_io/)
-        • X.com: [@NihalGazi_](https://x.com/NihalGazi_?t=f9UtAv005GppiIIXFEWMSQ&s=09)
-        • Discord: nihal_gazi_io"""
-    )
     with gr.Row():
         with gr.Column(scale=2):
-            prompt_input = gr.Textbox(label="Prompt", placeholder="Enter the text you want to convert to speech...")
-            emotion_input = gr.Textbox(label="Emotion Style", placeholder="e.g., happy, sad, excited, calm...")
-            voice_dropdown = gr.Dropdown(label="Voice", choices=VOICES, value="alloy")
         with gr.Column(scale=1):
             random_seed_checkbox = gr.Checkbox(label="Use Random Seed", value=True)
             seed_input = gr.Number(label="Specific Seed", value=12345, visible=False, precision=0)
@@ -172,27 +153,17 @@ with gr.Blocks() as app:
         audio_output = gr.Audio(label="Generated Audio", type="filepath")
         status_output = gr.Textbox(label="Status")
-    random_seed_checkbox.change(
-        fn=toggle_seed_input,
-        inputs=[random_seed_checkbox],
-        outputs=[seed_input]
-    )
     submit_button.click(
         fn=text_to_speech_app,
-        inputs=[
-            prompt_input,
-            voice_dropdown,
-            emotion_input,
-            random_seed_checkbox,
-            seed_input
-        ],
         outputs=[audio_output, status_output],
         concurrency_limit=30
     )
     gr.Examples(
         examples=[
             ["Hello there! This is a test of the text-to-speech system.", "alloy", "neutral", False, 12345],
@@ -203,14 +174,11 @@ with gr.Blocks() as app:
         inputs=[prompt_input, voice_dropdown, emotion_input, random_seed_checkbox, seed_input],
         outputs=[audio_output, status_output],
         fn=text_to_speech_app,
-        cache_examples=False,
     )
 if __name__ == "__main__":
     if NSFW_URL_TEMPLATE and TTS_URL_TEMPLATE:
         app.launch()
     else:
         print("ERROR: Cannot launch app. Required API URL secrets are missing.")

 import urllib.parse
 import tempfile
 import os
+import atexit
+import time
 NSFW_URL_TEMPLATE = os.getenv("NSFW_API_URL_TEMPLATE")
 TTS_URL_TEMPLATE = os.getenv("TTS_API_URL_TEMPLATE")
 if not NSFW_URL_TEMPLATE:
     raise ValueError("Missing Secret: NSFW_API_URL_TEMPLATE is not set in Hugging Face Space secrets.")
 if not TTS_URL_TEMPLATE:
     raise ValueError("Missing Secret: TTS_API_URL_TEMPLATE is not set in Hugging Face Space secrets.")
 # VOICES
 VOICES = [
+    "alloy", "echo", "fable", "onyx", "nova", "shimmer",
+    "coral", "verse", "ballad", "ash", "sage", "amuch", "dan"
 ]
+# EMOTION CATEGORIES
+EMOTION_CATEGORIES = {
+    "Common": ["neutral", "happy", "sad", "excited", "angry", "calm", "fearful", "joyful", "surprised"],
+    "Complex": [
+        "sarcastic", "sarcastic and mocking", "sad and depressed, with stammering",
+        "excited and joyful", "angry and frustrated", "calm and soothing",
+        "nervous and anxious", "happy and relieved", "fearful and tense"
+    ]
+}
+# Paths of temporary files to clean up on exit
+temp_files = []
+def cleanup_temp_files():
+    for f in temp_files:
+        try:
+            os.remove(f)
+        except:
+            pass
+atexit.register(cleanup_temp_files)
+# NSFW check function
 def check_nsfw(prompt: str) -> bool:
     try:
         encoded_prompt = urllib.parse.quote(prompt)
         url = NSFW_URL_TEMPLATE.format(prompt=encoded_prompt)
         response = requests.get(url, timeout=20)
         response.raise_for_status()
         result = response.text.strip().upper()
+        return result == "YES"
     except requests.exceptions.RequestException as e:
         print(f"Error during NSFW check: {e}")
+        return True
     except Exception as e:
         print(f"Unexpected error during NSFW check: {e}")
+        return True
+# Audio generation with retries
 def generate_audio(prompt: str, voice: str, emotion: str, seed: int) -> bytes:
+    encoded_prompt = urllib.parse.quote(prompt)
+    encoded_emotion = urllib.parse.quote(emotion)
+    url = TTS_URL_TEMPLATE.format(
+        prompt=encoded_prompt,
+        emotion=encoded_emotion,
+        voice=voice,
+        seed=seed
+    )
+    for attempt in range(3):
+        try:
+            response = requests.get(url, timeout=60)
+            response.raise_for_status()
+            content_type = response.headers.get('content-type', '').lower()
+            if 'audio' not in content_type:
+                print(f"Warning: Unexpected content type: {content_type}")
+                print(f"Response Text: {response.text[:500]}")
+                raise gr.Error("API did not return audio.")
+            return response.content
+        except requests.exceptions.RequestException as e:
+            print(f"Attempt {attempt+1} failed: {e}")
+            if attempt == 2:
+                raise gr.Error("Failed to generate audio after 3 attempts.")
+            time.sleep(1)
+# Main function
 def text_to_speech_app(prompt: str, voice: str, emotion: str, use_random_seed: bool, specific_seed: int):
     if not prompt:
         raise gr.Error("Prompt cannot be empty.")
     if not emotion:
         emotion = "neutral"
     if not voice:
+        raise gr.Error("Please select a voice.")
     seed = random.randint(0, 2**32 - 1) if use_random_seed else int(specific_seed)
+    # NSFW check (optional)
+    # is_nsfw = check_nsfw(prompt)
+    is_nsfw = False
     if is_nsfw:
         return None, "Error: The prompt was flagged as inappropriate and cannot be processed."
     try:
         audio_bytes = generate_audio(prompt, voice, emotion, seed)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio_file:
             temp_audio_file.write(audio_bytes)
             temp_file_path = temp_audio_file.name
+            temp_files.append(temp_file_path)
         return temp_file_path, f"Audio generated successfully with voice '{voice}', emotion '{emotion}', and seed {seed}."
     except gr.Error as e:
+        return None, str(e)
     except Exception as e:
+        return None, f"Unexpected error: {e}"
+# Sync the dropdown selection into the emotion textbox
+def update_emotion_textbox(selected_emotion):
+    return selected_emotion
+# Update the emotion dropdown options based on the selected category
+def update_emotion_options(category):
+    return gr.update(choices=EMOTION_CATEGORIES[category], value=EMOTION_CATEGORIES[category][0])
+# Toggle seed input
 def toggle_seed_input(use_random_seed):
     return gr.update(visible=not use_random_seed, value=12345)
+# Gradio UI
 with gr.Blocks() as app:
     gr.Markdown("# Advanced OpenAI Text-To-Speech Unlimited")
+    gr.Markdown("Enter text, choose a voice and emotion, and generate audio.")
     with gr.Row():
         with gr.Column(scale=2):
+            prompt_input = gr.Textbox(label="Prompt", placeholder="Type the text here...")
+            with gr.Row():
+                emotion_input = gr.Textbox(label="Emotion Style", placeholder="Type an emotion or select from the dropdown...")
+                with gr.Column():
+                    category_dropdown = gr.Dropdown(label="Emotion Category", choices=list(EMOTION_CATEGORIES.keys()), value="Common", interactive=True)
+                    emotion_dropdown = gr.Dropdown(label="Select Emotion", choices=EMOTION_CATEGORIES["Common"], value="neutral", interactive=True)
         with gr.Column(scale=1):
+            voice_dropdown = gr.Dropdown(label="Voice", choices=VOICES, value="alloy")
             random_seed_checkbox = gr.Checkbox(label="Use Random Seed", value=True)
             seed_input = gr.Number(label="Specific Seed", value=12345, visible=False, precision=0)
         audio_output = gr.Audio(label="Generated Audio", type="filepath")
         status_output = gr.Textbox(label="Status")
+    # Events
+    category_dropdown.change(fn=update_emotion_options, inputs=[category_dropdown], outputs=[emotion_dropdown])
+    emotion_dropdown.change(fn=update_emotion_textbox, inputs=[emotion_dropdown], outputs=[emotion_input])
+    random_seed_checkbox.change(fn=toggle_seed_input, inputs=[random_seed_checkbox], outputs=[seed_input])
     submit_button.click(
         fn=text_to_speech_app,
+        inputs=[prompt_input, voice_dropdown, emotion_input, random_seed_checkbox, seed_input],
         outputs=[audio_output, status_output],
         concurrency_limit=30
     )
     gr.Examples(
         examples=[
             ["Hello there! This is a test of the text-to-speech system.", "alloy", "neutral", False, 12345],
         inputs=[prompt_input, voice_dropdown, emotion_input, random_seed_checkbox, seed_input],
         outputs=[audio_output, status_output],
         fn=text_to_speech_app,
+        cache_examples=False,
     )
 if __name__ == "__main__":
     if NSFW_URL_TEMPLATE and TTS_URL_TEMPLATE:
         app.launch()
     else:
         print("ERROR: Cannot launch app. Required API URL secrets are missing.")