Spaces:

Yilin0601
/

Multimodal_Language_Learning_Aid

Sleeping

App Files Files Community

Yilin0601 committed on Mar 26

Commit

178dac1

verified ·

1 Parent(s): 2334caf

Update app.py

Browse files

Files changed (1) hide show

app.py +110 -84

app.py CHANGED Viewed

@@ -2,12 +2,24 @@ import gradio as gr
 import torch
 import numpy as np
 import librosa
-import soundfile as sf  # likely needed by the pipeline or local saving
-from transformers import pipeline, VitsModel, AutoTokenizer
-from datasets import load_dataset
 # ------------------------------------------------------
-# 1. ASR Pipeline (English) - Wav2Vec2
 # ------------------------------------------------------
 asr = pipeline(
     "automatic-speech-recognition",
@@ -30,28 +42,32 @@ translation_tasks = {
 }
 # ------------------------------------------------------
-# 3. TTS Configuration
-#    - Spanish: VITS-based MMS TTS
-#    - Chinese & Japanese: Microsoft SpeechT5
 # ------------------------------------------------------
-# We'll store them as keys for convenience
-SPANISH_KEY = "Spanish"
-CHINESE_KEY = "Chinese"
-JAPANESE_KEY = "Japanese"
-# VITS config for Spanish only
 mms_spanish_config = {
-    "model_id": "facebook/mms-tts-spa",
     "architecture": "vits"
 }
 # ------------------------------------------------------
-# 4. Create TTS Pipelines / Models Once (Caching)
 # ------------------------------------------------------
 translator_cache = {}
-vits_model_cache = None  # for Spanish
-speech_t5_pipeline_cache = None  # for Chinese/Japanese
-speech_t5_speaker_embedding = None
 def get_translator(lang):
     """
@@ -65,91 +81,99 @@ def get_translator(lang):
     translator_cache[lang] = translator
     return translator
 def load_spanish_vits():
     """
-    Load and cache the Spanish VITS model + tokenizer (facebook/mms-tts-spa).
     """
-    global vits_model_cache
-    if vits_model_cache is not None:
-        return vits_model_cache
     try:
-        model_id = mms_spanish_config["model_id"]
-        model = VitsModel.from_pretrained(model_id)
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        vits_model_cache = (model, tokenizer)
     except Exception as e:
         raise RuntimeError(f"Failed to load Spanish TTS model {mms_spanish_config['model_id']}: {e}")
-    return vits_model_cache
-def load_speech_t5_pipeline():
-    """
-    Load and cache the Microsoft SpeechT5 text-to-speech pipeline
-    and a default speaker embedding.
-    """
-    global speech_t5_pipeline_cache, speech_t5_speaker_embedding
-    if speech_t5_pipeline_cache is not None and speech_t5_speaker_embedding is not None:
-        return speech_t5_pipeline_cache, speech_t5_speaker_embedding
-    try:
-        # Create the pipeline
-        # The pipeline is named "text-to-speech" in Transformers >= 4.29
-        t5_pipe = pipeline("text-to-speech", model="microsoft/speecht5_tts")
-    except Exception as e:
-        raise RuntimeError(f"Failed to load Microsoft SpeechT5 pipeline: {e}")
-    # Load a default speaker embedding
-    try:
-        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
-        # Just pick an arbitrary index for speaker embedding
-        speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
-    except Exception as e:
-        raise RuntimeError(f"Failed to load default speaker embedding: {e}")
-    speech_t5_pipeline_cache = t5_pipe
-    speech_t5_speaker_embedding = speaker_embedding
-    return t5_pipe, speaker_embedding
-# ------------------------------------------------------
-# 5. TTS Inference Helpers
-# ------------------------------------------------------
-def run_vits_inference(text):
     """
-    For Spanish TTS using MMS (facebook/mms-tts-spa).
     """
     model, tokenizer = load_spanish_vits()
     inputs = tokenizer(text, return_tensors="pt")
     with torch.no_grad():
         output = model(**inputs)
     if not hasattr(output, "waveform"):
-        raise RuntimeError("VITS output does not contain 'waveform'.")
     waveform = output.waveform.squeeze().cpu().numpy()
     sample_rate = 16000
     return sample_rate, waveform
-def run_speecht5_inference(text):
     """
-    For Chinese & Japanese TTS using Microsoft SpeechT5 pipeline.
     """
-    t5_pipe, speaker_embedding = load_speech_t5_pipeline()
-    # The pipeline returns a dict with 'audio' (numpy) and 'sampling_rate'
-    result = t5_pipe(
-        text,
-        forward_params={"speaker_embeddings": speaker_embedding}
-    )
-    waveform = result["audio"]
-    sample_rate = result["sampling_rate"]
-    return sample_rate, waveform
 # ------------------------------------------------------
-# 6. Main Prediction Function
 # ------------------------------------------------------
 def predict(audio, text, target_language):
     """
     1. Get English text (ASR if audio provided, else text).
     2. Translate to target_language.
-    3. TTS with the chosen approach (VITS for Spanish, SpeechT5 for Chinese/Japanese).
     """
     # Step 1: English text
     if text.strip():
@@ -185,25 +209,25 @@ def predict(audio, text, target_language):
     # Step 3: TTS
     try:
-        if target_language == SPANISH_KEY:
-            sr, waveform = run_vits_inference(translated_text)
         else:
-            # Chinese or Japanese -> SpeechT5
-            sr, waveform = run_speecht5_inference(translated_text)
     except Exception as e:
         return english_text, translated_text, f"TTS error: {e}"
     return english_text, translated_text, (sr, waveform)
 # ------------------------------------------------------
-# 7. Gradio Interface
 # ------------------------------------------------------
 iface = gr.Interface(
     fn=predict,
     inputs=[
         gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
         gr.Textbox(lines=4, placeholder="Or enter English text here", label="English Text Input (optional)"),
-        gr.Dropdown(choices=["Spanish", "Chinese", "Japanese"], value="Spanish", label="Target Language")
     ],
     outputs=[
         gr.Textbox(label="English Transcription"),
@@ -212,14 +236,16 @@ iface = gr.Interface(
     ],
     title="Multimodal Language Learning Aid",
     description=(
-        "1. Transcribes English speech using Wav2Vec2-960h (or takes English text).\n"
-        "2. Translates to Spanish, Chinese, or Japanese.\n"
-        "3. Provides synthetic speech:\n"
         "   - Spanish -> facebook/mms-tts-spa (VITS)\n"
-        "   - Chinese & Japanese -> microsoft/speecht5_tts (SpeechT5)\n\n"
     ),
     allow_flagging="never"
 )
 if __name__ == "__main__":
-    iface.launch(server_name="0.0.0.0", server_port=7860)

 import torch
 import numpy as np
 import librosa
+import soundfile as sf
+import tempfile
+import os
+from transformers import (
+    pipeline,
+    VitsModel,
+    AutoTokenizer
+)
+# For Coqui TTS
+try:
+    from TTS.api import TTS as CoquiTTS
+except ImportError:
+    raise ImportError("Please install Coqui TTS via `pip install TTS`.")
 # ------------------------------------------------------
+# 1. ASR Pipeline (English) using Wav2Vec2
 # ------------------------------------------------------
 asr = pipeline(
     "automatic-speech-recognition",
 }
 # ------------------------------------------------------
+# 3. TTS Config:
+#    - Spanish: MMS TTS (facebook/mms-tts-spa)
+#    - Chinese, Japanese: Coqui XTTS-v2 (tts_models/multilingual/multi-dataset/xtts_v2)
 # ------------------------------------------------------
# Display names used both as dropdown choices and as dispatch keys for TTS.
SPANISH = "Spanish"
CHINESE = "Chinese"
JAPANESE = "Japanese"

# For Spanish (MMS)
mms_spanish_config = {
    "model_id": "facebook/mms-tts-spa",
    "architecture": "vits",
}

# Map Chinese/Japanese to Coqui XTTS-v2 language codes.
# XTTS-v2's documented language list spells Chinese as "zh-cn" (not "zh"),
# so use the documented code.
coqui_lang_map = {
    CHINESE: "zh-cn",
    JAPANESE: "ja",
}
 # ------------------------------------------------------
+# 4. Global Caches
 # ------------------------------------------------------
 translator_cache = {}
+spanish_vits_cache = None
+coqui_tts_cache = None
 def get_translator(lang):
     """
     translator_cache[lang] = translator
     return translator
# ------------------------------------------------------
# 5. Spanish TTS: MMS (VITS)
# ------------------------------------------------------
def load_spanish_vits():
    """
    Load and cache the Spanish MMS TTS model (VITS).

    The (model, tokenizer) pair is stored in the module-level
    ``spanish_vits_cache`` so the weights are only loaded once.

    Returns:
        tuple: ``(model, tokenizer)`` for ``mms_spanish_config["model_id"]``.

    Raises:
        RuntimeError: if the model or tokenizer cannot be loaded; the
            original exception is attached as ``__cause__``.
    """
    global spanish_vits_cache
    if spanish_vits_cache is not None:
        return spanish_vits_cache

    model_id = mms_spanish_config["model_id"]
    try:
        model = VitsModel.from_pretrained(model_id)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
    except Exception as e:
        # Chain explicitly so the underlying loader error stays in the traceback.
        raise RuntimeError(f"Failed to load Spanish TTS model {model_id}: {e}") from e

    # Only populate the cache once both pieces loaded successfully.
    spanish_vits_cache = (model, tokenizer)
    return spanish_vits_cache
def run_spanish_tts(text):
    """
    Run MMS TTS (VITS) for Spanish text.

    Args:
        text: Spanish text to synthesize.

    Returns:
        tuple: ``(sample_rate, waveform)`` where ``waveform`` is a NumPy array.

    Raises:
        RuntimeError: if the model output has no ``waveform`` attribute
            (or if loading the model fails in ``load_spanish_vits``).
    """
    model, tokenizer = load_spanish_vits()
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = model(**inputs)
    if not hasattr(output, "waveform"):
        raise RuntimeError("Spanish TTS model output does not contain 'waveform'.")
    waveform = output.waveform.squeeze().cpu().numpy()
    # Read the rate from the model config rather than hard-coding it, so the
    # returned audio plays at the right speed if the checkpoint changes.
    # Fall back to the previous constant when the config does not expose it.
    sample_rate = getattr(getattr(model, "config", None), "sampling_rate", 16000)
    return sample_rate, waveform
+# ------------------------------------------------------
+# 6. Chinese/Japanese TTS: Coqui XTTS-v2
+# ------------------------------------------------------
def load_coqui_tts():
    """
    Load and cache the Coqui XTTS-v2 model (multilingual).

    The instance is stored in the module-level ``coqui_tts_cache`` so the
    model is only constructed once.

    Returns:
        The cached ``CoquiTTS`` instance.

    Raises:
        RuntimeError: if the model cannot be constructed; the original
            exception is attached as ``__cause__``.
    """
    global coqui_tts_cache
    if coqui_tts_cache is not None:
        return coqui_tts_cache
    try:
        # If you have a GPU on HF Spaces, you can set gpu=True.
        # If not, set gpu=False to run on CPU (slower).
        coqui_tts_cache = CoquiTTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
    except Exception as e:
        # f-string + explicit chaining, matching the other loaders in this file.
        raise RuntimeError(f"Failed to load Coqui XTTS-v2 TTS: {e}") from e
    return coqui_tts_cache
def run_coqui_tts(text, lang):
    """
    Run Coqui TTS for Chinese or Japanese text.

    Args:
        text: Text to synthesize, already translated into ``lang``.
        lang: Display name of the target language; must be a key of
            ``coqui_lang_map``.

    Returns:
        tuple: ``(sample_rate, waveform)`` as read back from the WAV file
        written by Coqui.

    Raises:
        ValueError: if ``lang`` has no entry in ``coqui_lang_map``.
        RuntimeError: if the Coqui model cannot be loaded.
    """
    lang_code = coqui_lang_map.get(lang)
    if lang_code is None:
        # A bare KeyError would surface to the user as just "'<lang>'".
        raise ValueError(f"Unsupported language for Coqui TTS: {lang!r}")

    coqui_tts = load_coqui_tts()

    # No speaker_wav is supplied (no voice cloning). Coqui's API refuses to
    # synthesize with a multi-speaker model unless `speaker` or `speaker_wav`
    # is given, so fall back to the first built-in speaker when the model
    # exposes any. NOTE(review): confirm `speakers` is populated for the
    # installed TTS version; if it is empty the call is unchanged.
    speakers = getattr(coqui_tts, "speakers", None)
    speaker_kwargs = {"speaker": speakers[0]} if speakers else {}

    # We must output to a file, then read it back.
    # Reserve a path only; the handle is closed before Coqui writes to it.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp_name = tmp.name
    try:
        coqui_tts.tts_to_file(
            text=text,
            file_path=tmp_name,
            language=lang_code,
            **speaker_kwargs,
        )
        data, sr = sf.read(tmp_name)
    finally:
        # Cleanup the temporary file even if synthesis or reading failed.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
    return sr, data
 # ------------------------------------------------------
+# 7. Main Prediction Function
 # ------------------------------------------------------
 def predict(audio, text, target_language):
     """
     1. Get English text (ASR if audio provided, else text).
     2. Translate to target_language.
+    3. TTS with the chosen approach:
+       - Spanish -> MMS TTS (VITS)
+       - Chinese/Japanese -> Coqui XTTS-v2
     """
     # Step 1: English text
     if text.strip():
     # Step 3: TTS
     try:
+        if target_language == SPANISH:
+            sr, waveform = run_spanish_tts(translated_text)
         else:
+            # Chinese or Japanese
+            sr, waveform = run_coqui_tts(translated_text, target_language)
     except Exception as e:
         return english_text, translated_text, f"TTS error: {e}"
     return english_text, translated_text, (sr, waveform)
 # ------------------------------------------------------
+# 8. Gradio Interface
 # ------------------------------------------------------
 iface = gr.Interface(
     fn=predict,
     inputs=[
         gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
         gr.Textbox(lines=4, placeholder="Or enter English text here", label="English Text Input (optional)"),
+        gr.Dropdown(choices=[SPANISH, CHINESE, JAPANESE], value=SPANISH, label="Target Language")
     ],
     outputs=[
         gr.Textbox(label="English Transcription"),
     ],
     title="Multimodal Language Learning Aid",
     description=(
+        "1. Transcribes English speech using Wav2Vec2 (or takes English text).\n"
+        "2. Translates to Spanish, Chinese, or Japanese (via Helsinki-NLP).\n"
+        "3. Synthesizes speech:\n"
         "   - Spanish -> facebook/mms-tts-spa (VITS)\n"
+        "   - Chinese & Japanese -> Coqui XTTS-v2 (multilingual TTS)\n\n"
+        "Note: The Coqui model is 'tts_models/multilingual/multi-dataset/xtts_v2' and expects language codes.\n"
+        "If you need voice cloning, set `speaker_wav` in `tts_to_file()`. By default, it uses a single generic voice."
     ),
     allow_flagging="never"
 )
 if __name__ == "__main__":
+    iface.launch(server_name="0.0.0.0", server_port=7860)