Spaces:

Yilin0601
/

Multimodal_Language_Learning_Aid

Sleeping

App Files Community

Yilin0601 committed on Mar 25

Commit

be4098e

verified ·

1 Parent(s): 1ce7fad

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -39

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import torch
 import numpy as np
 import librosa
 from transformers import pipeline
-from melo.api import TTS
 # --------------------------------------------------
 # ASR Pipeline (for English transcription)
@@ -14,7 +14,7 @@ asr = pipeline(
 )
 # --------------------------------------------------
-# Mapping for Target Languages and Models
 # --------------------------------------------------
 translation_models = {
     "Spanish": "Helsinki-NLP/opus-mt-en-es",
@@ -29,8 +29,6 @@ translation_models = {
     "Korean": "Helsinki-NLP/opus-mt-en-ko"
 }
-# Each language often requires a specific pipeline task name
-# (e.g., "translation_en_to_zh" rather than "translation_en_to_chinese")
 translation_tasks = {
     "Spanish": "translation_en_to_es",
     "French": "translation_en_to_fr",
@@ -44,18 +42,20 @@ translation_tasks = {
     "Korean": "translation_en_to_ko"
 }
-# TTS models (some may not exist or may be unofficial)
 tts_models = {
-    "Spanish": "myshell-ai/MeloTTS-Spanish",
-    "French": "myshell-ai/MeloTTS-French",
-    "German": "tts_models/de/tacotron2",
-    "Chinese": "myshell-ai/MeloTTS-English-v2",     # Verify if this actually exists on Hugging Face
-    "Russian": "tts_models/ru/tacotron2",     # Same note
-    "Arabic": "tts_models/ar/tacotron2",      # Same note
-    "Portuguese": "tts_models/pt/tacotron2",  # Same note
-    "Japanese": "myshell-ai/MeloTTS-Japanese",    # Same note
-    "Italian": "tts_models/it/tacotron2",     # Same note
-    "Korean": "myshell-ai/MeloTTS-Korean"       # Same note
 }
 # --------------------------------------------------
@@ -73,31 +73,28 @@ def get_translator(target_language):
     model_name = translation_models[target_language]
     task_name = translation_tasks[target_language]
     translator = pipeline(task_name, model=model_name)
     translator_cache[target_language] = translator
     return translator
 def get_tts(target_language):
     """
-    Retrieve or create a TTS pipeline for the specified language, if available.
     """
     if target_language in tts_cache:
         return tts_cache[target_language]
     model_name = tts_models.get(target_language)
     if model_name is None:
-        # If no TTS model is mapped, raise an error or handle gracefully
         raise ValueError(f"No TTS model available for {target_language}.")
     try:
         tts_pipeline = pipeline("text-to-speech", model=model_name)
     except Exception as e:
         raise ValueError(
-            f"Failed to load TTS model for {target_language}. "
-            f"Make sure '{model_name}' exists on Hugging Face.\nError: {e}"
         )
     tts_cache[target_language] = tts_pipeline
     return tts_pipeline
@@ -110,47 +107,38 @@ def predict(audio, text, target_language):
     2. Translate English -> target_language.
     3. Synthesize speech in target_language.
     """
-    # 1. English text from text input (if provided), else from audio via ASR
     if text.strip():
         english_text = text.strip()
     elif audio is not None:
         sample_rate, audio_data = audio
-        # Ensure the audio is float32 for librosa
         if audio_data.dtype not in [np.float32, np.float64]:
             audio_data = audio_data.astype(np.float32)
-        # Convert stereo to mono if needed
         if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
             audio_data = np.mean(audio_data, axis=1)
-        # Resample to 16 kHz if necessary
         if sample_rate != 16000:
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
         input_audio = {"array": audio_data, "sampling_rate": 16000}
         asr_result = asr(input_audio)
         english_text = asr_result["text"]
     else:
         return "No input provided.", "", None
-    # 2. Translation step
     translator = get_translator(target_language)
     try:
         translation_result = translator(english_text)
         translated_text = translation_result[0]["translation_text"]
     except Exception as e:
-        # If there's an error in translation, return partial results
         return english_text, f"Translation error: {e}", None
-    # 3. TTS step: synthesize speech from the translated text
     try:
         tts_pipeline = get_tts(target_language)
         tts_result = tts_pipeline(translated_text)
-        # The TTS pipeline returns a dict with "wav" and "sample_rate"
         synthesized_audio = (tts_result["sample_rate"], tts_result["wav"])
     except Exception as e:
-        # If TTS fails, return partial results
         return english_text, translated_text, f"TTS error: {e}"
     return english_text, translated_text, synthesized_audio
@@ -172,13 +160,12 @@ iface = gr.Interface(
     ],
     title="Multimodal Language Learning Aid",
     description=(
-        "This app helps language learners by providing three outputs:\n"
         "1. English transcription (from ASR or text input),\n"
         "2. Translation to a target language (using Helsinki-NLP models), and\n"
-        "3. Synthetic speech in the target language.\n\n"
         "Select one of the top 10 commonly used languages from the dropdown.\n"
-        "Either record/upload an English audio sample or enter English text directly.\n\n"
-        "Note: Some TTS models may not exist or be unstable for certain languages."
     ),
     allow_flagging="never"
 )

 import numpy as np
 import librosa
 from transformers import pipeline
+import scipy  # imported if needed for processing
 # --------------------------------------------------
 # ASR Pipeline (for English transcription)
 )
 # --------------------------------------------------
+# Mapping for Target Languages and Translation Pipelines
 # --------------------------------------------------
 translation_models = {
     "Spanish": "Helsinki-NLP/opus-mt-en-es",
     "Korean": "Helsinki-NLP/opus-mt-en-ko"
 }
 translation_tasks = {
     "Spanish": "translation_en_to_es",
     "French": "translation_en_to_fr",
     "Korean": "translation_en_to_ko"
 }
+# --------------------------------------------------
+# TTS Models (using real Facebook MMS TTS & others)
+# --------------------------------------------------
 tts_models = {
+    "Spanish": "facebook/mms-tts-spa",
+    "French": "facebook/mms-tts-fra",
+    "German": "facebook/mms-tts-deu",
+    "Chinese": "facebook/mms-tts-che",
+    "Russian": "facebook/mms-tts-rus",
+    "Arabic": "facebook/mms-tts-ara",
+    "Portuguese": "facebook/mms-tts-por",
+    "Japanese": "esnya/japanese_speecht5_tts",
+    "Italian": "tts_models/it/tacotron2",
+    "Korean": "facebook/mms-tts-kor"
 }
 # --------------------------------------------------
     model_name = translation_models[target_language]
     task_name = translation_tasks[target_language]
     translator = pipeline(task_name, model=model_name)
     translator_cache[target_language] = translator
     return translator
 def get_tts(target_language):
     """
+    Retrieve or create a TTS pipeline for the specified language.
     """
     if target_language in tts_cache:
         return tts_cache[target_language]
     model_name = tts_models.get(target_language)
     if model_name is None:
         raise ValueError(f"No TTS model available for {target_language}.")
     try:
         tts_pipeline = pipeline("text-to-speech", model=model_name)
     except Exception as e:
         raise ValueError(
+            f"Failed to load TTS model for {target_language} with model '{model_name}'.\nError: {e}"
         )
     tts_cache[target_language] = tts_pipeline
     return tts_pipeline
     2. Translate English -> target_language.
     3. Synthesize speech in target_language.
     """
+    # Step 1: Get English text from text input (if provided) or from ASR.
     if text.strip():
         english_text = text.strip()
     elif audio is not None:
         sample_rate, audio_data = audio
         if audio_data.dtype not in [np.float32, np.float64]:
             audio_data = audio_data.astype(np.float32)
         if len(audio_data.shape) > 1 and audio_data.shape[1] > 1:
             audio_data = np.mean(audio_data, axis=1)
         if sample_rate != 16000:
             audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
         input_audio = {"array": audio_data, "sampling_rate": 16000}
         asr_result = asr(input_audio)
         english_text = asr_result["text"]
     else:
         return "No input provided.", "", None
+    # Step 2: Translation
     translator = get_translator(target_language)
     try:
         translation_result = translator(english_text)
         translated_text = translation_result[0]["translation_text"]
     except Exception as e:
         return english_text, f"Translation error: {e}", None
+    # Step 3: TTS synthesis using Facebook MMS TTS (or alternative) pipeline.
     try:
         tts_pipeline = get_tts(target_language)
         tts_result = tts_pipeline(translated_text)
+        # Expected output: a dict with "wav" and "sample_rate"
         synthesized_audio = (tts_result["sample_rate"], tts_result["wav"])
     except Exception as e:
         return english_text, translated_text, f"TTS error: {e}"
     return english_text, translated_text, synthesized_audio
     ],
     title="Multimodal Language Learning Aid",
     description=(
+        "This app provides three outputs:\n"
         "1. English transcription (from ASR or text input),\n"
         "2. Translation to a target language (using Helsinki-NLP models), and\n"
+        "3. Synthetic speech in the target language (using Facebook MMS TTS or equivalent).\n\n"
         "Select one of the top 10 commonly used languages from the dropdown.\n"
+        "Either record/upload an English audio sample or enter English text directly."
     ),
     allow_flagging="never"
 )