Spaces:

Futuresony
/

Project_1

Runtime error

App Files Files Community

Futuresony committed on Aug 10

Commit

326df18

verified ·

1 Parent(s): 727f54d

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -28

app.py CHANGED Viewed

@@ -2,31 +2,34 @@ import gradio as gr
 import torch
 import torchaudio
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-# from huggingface_hub import InferenceClient # Removed
 from ttsmms import download, TTS
 from langdetect import detect
-from gradio_client import Client # Added
 # Load ASR Model
 asr_model_name = "Futuresony/Future-sw_ASR-24-02-2025"
 processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
 asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
-# Load Text Generation Model - Using Gradio Client
-# client = InferenceClient("unsloth/gemma-3-1b-it") # Removed
-llm_client = Client("Futuresony/Mr.Events") # Added
-# def format_prompt(user_input): # Removed
-#     return f"{user_input}" # Removed
 # Load TTS Models
 swahili_dir = download("swh", "./data/swahili")
 english_dir = download("eng", "./data/english")
 swahili_tts = TTS(swahili_dir)
 english_tts = TTS(english_dir)
 # ASR Function
 def transcribe(audio_file):
     speech_array, sample_rate = torchaudio.load(audio_file)
     resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
@@ -38,39 +41,62 @@ def transcribe(audio_file):
     transcription = processor.batch_decode(predicted_ids)[0]
     return transcription
-# Text Generation Function - Using Gradio Client
 def generate_text(prompt):
-    # formatted_prompt = format_prompt(prompt) # Removed
-    # response = client.text_generation(formatted_prompt, max_new_tokens=250, temperature=0.7, top_p=0.95) # Removed
-    print(f"Generating text for prompt (type: {type(prompt)}): {prompt}") # Debug print
-    result = llm_client.predict(query=prompt, api_name="/chat") # Added
-    print(f"Generated text result (type: {type(result)}): {result}") # Debug print
-    return result.strip() # Modified to return the result from the Gradio Client
 # TTS Function
 def text_to_speech(text):
-    print(f"Converting text to speech (type: {type(text)}): {text}") # Debug print
     lang = detect(text)
     wav_path = "./output.wav"
-    if lang == "sw":
-        swahili_tts.synthesis(text, wav_path=wav_path)
-    else:
-        english_tts.synthesis(text, wav_path=wav_path)
-    print(f"TTS output path (type: {type(wav_path)}): {wav_path}") # Debug print
     return wav_path
 # Combined Processing Function
 def process_audio(audio):
-    print(f"Processing audio file (type: {type(audio)}): {audio}") # Debug print
     transcription = transcribe(audio)
-    print(f"Transcription result (type: {type(transcription)}): {transcription}") # Debug print
     generated_text = generate_text(transcription)
-    print(f"Generated text after function call (type: {type(generated_text)}): {generated_text}") # Debug print
-    speech = text_to_speech(generated_text)
-    print(f"Speech output after function call (type: {type(speech)}): {speech}") # Debug print
-    return transcription, generated_text, speech
 # Gradio Interface
 with gr.Blocks() as demo:
     gr.Markdown("<p align='center' style='font-size: 20px;'>End-to-End ASR, Text Generation, and TTS</p>")
     gr.HTML("<center>Upload or record audio. The model will transcribe, generate a response, and read it out.</center>")
@@ -88,4 +114,4 @@ with gr.Blocks() as demo:
     )
 if __name__ == "__main__":
-    demo.launch()

 import torch
 import torchaudio
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
 from ttsmms import download, TTS
 from langdetect import detect
+from gradio_client import Client
+# =========================
 # Load ASR Model
+# =========================
 # Model identifier shared by the processor and the CTC model loaded below.
 asr_model_name = "Futuresony/Future-sw_ASR-24-02-2025"
 # Both objects are created once at import time; transcribe() uses `processor`
 # to decode predicted ids into text.
 processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
 asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
+# =========================
+# Load Text Generation Model via Gradio Client
+# =========================
 # Client bound to "Futuresony/Mr.Events"; generate_text() calls its "/chat" API.
+llm_client = Client("Futuresony/Mr.Events")
+# =========================
 # Load TTS Models
+# =========================
 # download() returns a value that is passed straight to TTS(); presumably it
 # fetches the language's model files into the given directory — confirm
 # against the ttsmms documentation.
 swahili_dir = download("swh", "./data/swahili")
 english_dir = download("eng", "./data/english")
 # One synthesizer per language; text_to_speech() picks between them.
 swahili_tts = TTS(swahili_dir)
 english_tts = TTS(english_dir)
+# =========================
 # ASR Function
+# =========================
 def transcribe(audio_file):
     speech_array, sample_rate = torchaudio.load(audio_file)
     resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
     transcription = processor.batch_decode(predicted_ids)[0]
     return transcription
+# =========================
+# Text Generation Function (Safe)
+# =========================
 def generate_text(prompt):
     """Ask the remote ``/chat`` endpoint for a reply to ``prompt``.

     Args:
         prompt: Text forwarded as the ``query`` argument of the ``/chat``
             API exposed by the module-level ``llm_client``.

     Returns:
         The reply as a string with surrounding whitespace removed.
         List/tuple replies are joined with single spaces; any other
         non-string reply goes through ``str()``. If the remote call or the
         conversion raises, the fixed message
         ``"Error: Unable to generate text."`` is returned instead.
     """
     fallback = "Error: Unable to generate text."
     print(f"[DEBUG] Generating text for prompt: {prompt} (type: {type(prompt)})")
     try:
         result = llm_client.predict(query=prompt, api_name="/chat")
     except Exception as e:
         # The remote call is the step most likely to fail; degrade to the same
         # fallback message used for conversion errors instead of crashing the
         # whole audio pipeline.
         print(f"[ERROR] /chat request failed: {e}")
         return fallback
     print(f"[DEBUG] /chat returned: {result} (type: {type(result)})")
     # Ensure result is always a string
     if not isinstance(result, str):
         try:
             result = " ".join(map(str, result)) if isinstance(result, (list, tuple)) else str(result)
         except Exception as e:
             print(f"[ERROR] Failed to convert result to string: {e}")
             result = fallback
     return result.strip()
+# =========================
 # TTS Function
+# =========================
 def text_to_speech(text):
     """Synthesize ``text`` to ``./output.wav`` with the Swahili or English voice.

     The language is detected with ``detect``; ``"sw"`` selects
     ``swahili_tts`` and every other result selects ``english_tts``.

     Args:
         text: The text to speak.

     Returns:
         The path ``"./output.wav"`` on success, or ``None`` when ``text`` is
         empty/whitespace-only or synthesis raises.
     """
     # Local import: the file only imports `detect` from langdetect at top level.
     from langdetect import LangDetectException

     print(f"[DEBUG] Converting text to speech: {text} (type: {type(text)})")
     if not text or not text.strip():
         # Nothing to speak; language detection would raise on empty input.
         print("[ERROR] TTS skipped: empty text")
         return None
     try:
         lang = detect(text)
     except LangDetectException as e:
         # Text with no detectable features (e.g. only digits or punctuation)
         # makes detect() raise; fall back to the non-Swahili branch.
         print(f"[ERROR] Language detection failed, defaulting to English: {e}")
         lang = "en"
     wav_path = "./output.wav"
     try:
         if lang == "sw":
             swahili_tts.synthesis(text, wav_path=wav_path)
         else:
             english_tts.synthesis(text, wav_path=wav_path)
     except Exception as e:
         print(f"[ERROR] TTS synthesis failed: {e}")
         return None
     return wav_path
+# =========================
 # Combined Processing Function
+# =========================
 def process_audio(audio):
     """Run one audio input through transcription, reply generation and TTS.

     Args:
         audio: Passed unchanged to ``transcribe``.

     Returns:
         A 3-tuple ``(transcription, generated_text, speech_path)`` holding the
         results of ``transcribe``, ``generate_text`` and ``text_to_speech``
         in that order.
     """
     print(f"[DEBUG] Processing audio: {audio} (type: {type(audio)})")
     # Stage 1: speech -> text.
     heard = transcribe(audio)
     print(f"[DEBUG] Transcription: {heard}")
     # Stage 2: text -> reply text.
     reply = generate_text(heard)
     print(f"[DEBUG] Generated Text: {reply}")
     # Stage 3: reply text -> audio file path.
     wav_file = text_to_speech(reply)
     print(f"[DEBUG] Speech Path: {wav_file}")
     outputs = (heard, reply, wav_file)
     return outputs
+# =========================
 # Gradio Interface
+# =========================
 with gr.Blocks() as demo:
     gr.Markdown("<p align='center' style='font-size: 20px;'>End-to-End ASR, Text Generation, and TTS</p>")
     gr.HTML("<center>Upload or record audio. The model will transcribe, generate a response, and read it out.</center>")
     )
 if __name__ == "__main__":
+    demo.launch()