Spaces:

Futuresony
/

Project_1

Runtime error

App Files Community

Futuresony committed on Aug 10

Commit

326e81a

verified ·

1 Parent(s): 9f14aa7

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -337

app.py DELETED Viewed

@@ -1,337 +0,0 @@
-# app.py
-import os
-import tempfile
-import traceback
-from dataclasses import dataclass, field
-from typing import Any, List, Tuple, Optional
-import gradio as gr
-import numpy as np
-import soundfile as sf
-import torchaudio
-import torch
-from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
-from gradio_client import Client
-from ttsmms import download, TTS
-from langdetect import detect
-# ========================
-# CONFIG - update as needed
-# ========================
-# Local ASR model (change to correct HF repo id or local path)
-asr_model_name = "Futuresony/Future-sw_ASR-24-02-2025"
-# Remote LLM Gradio Space
-llm_space = "Futuresony/Mr.Events"
-llm_api_name = "/chat"
-# TTS languages
-sw_lang_code = "swh"  # ttsmms language code for Swahili (adjust if needed)
-en_lang_code = "eng"
-# ========================
-# LOAD MODELS / CLIENTS
-# ========================
-print("[INIT] Loading ASR processor & model...")
-processor = Wav2Vec2Processor.from_pretrained(asr_model_name)
-asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_name)
-asr_model.eval()
-print("[INIT] Creating Gradio Client for LLM Space...")
-llm_client = Client(llm_space)
-print("[INIT] Downloading TTS models (this may take time)")
-swahili_dir = download(sw_lang_code, "./data/swahili")
-english_dir = download(en_lang_code, "./data/english")
-swahili_tts = TTS(swahili_dir)
-english_tts = TTS(english_dir)
-# ========================
-# APP STATE
-# ========================
-@dataclass
-class AppState:
-    conversation: List[dict] = field(default_factory=list)
-    last_transcription: Optional[str] = None
-    last_reply: Optional[str] = None
-    last_wav: Optional[str] = None
-# ========================
-# UTIL: Safe LLM call
-# ========================
-def safe_predict(prompt: str, api_name: str = llm_api_name, timeout: int = 30) -> str:
-    """
-    Calls gradio_client.Client.predict() but defends against:
-    - gradio_client JSON schema parsing errors
-    - endpoints returning bool/list/tuple/dict
-    - other exceptions
-    Always returns a string (never bool or non-iterable).
-    """
-    try:
-        result = llm_client.predict(query=prompt, api_name=api_name)
-        print(f"[LLM] raw result: {repr(result)} (type={type(result)})")
-    except Exception as e:
-        # If gradio_client fails (schema issues etc.), catch and return an error message
-        print("[LLM] predict() raised an exception:")
-        traceback.print_exc()
-        return f"Error: could not contact LLM endpoint ({str(e)})"
-    # Convert whatever we got into a string safely
-    if isinstance(result, str):
-        return result.strip()
-    if isinstance(result, (list, tuple)):
-        try:
-            return " ".join(map(str, result)).strip()
-        except Exception:
-            return str(result)
-    # For bool/dict/None/other -> stringify
-    try:
-        return str(result).strip()
-    except Exception as e:
-        print("[LLM] Failed to stringify result:", e)
-        return "Error: LLM returned an unsupported type."
-# ========================
-# ASR (Wav2Vec2) helpers
-# ========================
-def write_temp_wav_from_gr_numpy(audio_tuple: Tuple[np.ndarray, int]) -> str:
-    """
-    Gradio audio (type='numpy') yields (np_array, sample_rate).
-    np_array shape: (n_samples, n_channels) or (n_samples,)
-    We'll write to a temporary WAV file using soundfile, and return path.
-    """
-    array, sr = audio_tuple
-    if array is None:
-        raise ValueError("Empty audio")
-    # If stereo, convert to mono by averaging channels
-    if array.ndim == 2:
-        array = np.mean(array, axis=1)
-    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-    tmp_name = tmp.name
-    tmp.close()
-    sf.write(tmp_name, array, sr)
-    return tmp_name
-def transcribe_wav_file(wav_path: str) -> str:
-    """Load with torchaudio (for resampling if needed), then transcribe."""
-    waveform, sr = torchaudio.load(wav_path)  # waveform: (channels, samples)
-    # convert to mono
-    if waveform.shape[0] > 1:
-        waveform = torch.mean(waveform, dim=0, keepdim=True)
-    waveform = waveform.squeeze(0).numpy()
-    # resample if necessary
-    if sr != 16000:
-        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
-        waveform = resampler(torch.from_numpy(waveform)).numpy()
-    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
-    with torch.no_grad():
-        logits = asr_model(inputs.input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.batch_decode(predicted_ids)[0]
-    return transcription
-# ========================
-# TTS helper
-# ========================
-def synthesize_text_to_wav(text: str) -> Optional[str]:
-    """Detect language and synthesize to ./output.wav (overwrites each call)."""
-    if not text:
-        return None
-    try:
-        lang = detect(text)
-    except Exception:
-        lang = "en"
-    wav_path = "./output.wav"
-    try:
-        if lang and lang.startswith("sw"):
-            swahili_tts.synthesis(text, wav_path=wav_path)
-        else:
-            english_tts.synthesis(text, wav_path=wav_path)
-        return wav_path
-    except Exception as e:
-        print("[TTS] synthesis failed:", e)
-        traceback.print_exc()
-        return None
-# ========================
-# GRPC/HTTP flow functions (for Gradio event hooks)
-# ========================
-def process_audio_start(audio: Tuple[np.ndarray, int], state: AppState):
-    """
-    Called when recording starts/stops depending on how you wire events.
-    We'll transcribe the incoming audio and append the user message to conversation.
-    Returns updated state and the latest transcription (so UI can show it).
-    """
-    try:
-        if audio is None:
-            return state, ""
-        wav = write_temp_wav_from_gr_numpy(audio)
-        transcription = transcribe_wav_file(wav)
-        print(f"[ASR] transcription: {transcription!r}")
-        state.last_transcription = transcription
-        # append user message for context
-        state.conversation.append({"role": "user", "content": transcription})
-        # cleanup temp wav
-        try:
-            os.remove(wav)
-        except Exception:
-            pass
-        return state, transcription
-    except Exception as e:
-        print("[ASR] error:", e)
-        traceback.print_exc()
-        return state, f"Error in transcription: {str(e)}"
-def generate_reply_stop(state: AppState):
-    """
-    Called after transcription is present in state (i.e. on stop_recording).
-    Generates a reply with safe_predict, appends to conversation, synthesizes TTS,
-    and returns updated state, the chat history (for Chatbot), and the output wav path.
-    """
-    try:
-        # Build messages for the LLM from state.conversation
-        # (prefix with system prompt for diet calorie assistant as earlier)
-        system_prompt = (
-            "In conversation with the user, ask questions to estimate and provide (1) total calories, "
-            "(2) protein, carbs, and fat in grams, (3) fiber and sugar content. Only ask one question at a time. "
-            "Be conversational and natural."
-        )
-        messages = [ {"role": "system", "content": system_prompt} ] + state.conversation
-        # Convert messages to a single text prompt for the remote space, if your remote space expects `query` plain text.
-        # If your remote space accepts structured messages, adapt accordingly.
-        # We'll join messages into a single friendly prompt (safe fallback).
-        prompt_text = ""
-        for m in messages:
-            role = m.get("role", "user")
-            content = m.get("content", "")
-            prompt_text += f"[{role}] {content}\n"
-        reply_text = safe_predict(prompt_text, api_name=llm_api_name)
-        print("[LLM] reply:", reply_text)
-        # Add assistant reply to conversation
-        state.conversation.append({"role": "assistant", "content": reply_text})
-        state.last_reply = reply_text
-        # Synthesize to wav (TTS)
-        wav_path = synthesize_text_to_wav(reply_text)
-        state.last_wav = wav_path
-        # Build chatbot history for gr.Chatbot (list of tuples (user, bot) or messages)
-        # gr.Chatbot expects list of (user_msg, bot_msg) pairs; we'll convert conversation
-        # into that form:
-        pairs = []
-        # collapse conversation into pairs
-        user_msgs = []
-        bot_msgs = []
-        # simple converter: walk conversation and pair each user with next assistant
-        conv = state.conversation
-        i = 0
-        while i < len(conv):
-            if conv[i]["role"] == "user":
-                user = conv[i]["content"]
-                # look ahead for assistant
-                assistant = ""
-                if i + 1 < len(conv) and conv[i+1]["role"] == "assistant":
-                    assistant = conv[i+1]["content"]
-                    i += 1
-                pairs.append((user, assistant))
-            i += 1
-        return state, pairs, wav_path
-    except Exception as e:
-        print("[LLM/TTS] error:", e)
-        traceback.print_exc()
-        return state, [("error", f"Error generating reply: {str(e)}")], None
-# ========================
-# CLIENT-SIDE VAD JS (embedded)
-# ========================
-custom_js = r"""
-async function main() {
-  // Load ONNX runtime and VAD library dynamically
-  const script1 = document.createElement("script");
-  script1.src = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js";
-  document.head.appendChild(script1);
-  const script2 = document.createElement("script");
-  script2.onload = async () =>  {
-    console.log("VAD loaded");
-    var record = document.querySelector('.record-button');
-    if (record) record.textContent = "Just Start Talking!";
-    // create MicVAD and auto click the record/stop buttons
-    try {
-      const myvad = await vad.MicVAD.new({
-        onSpeechStart: () => {
-          var record = document.querySelector('.record-button');
-          var player = document.querySelector('#streaming-out');
-          if (record && (!player || player.paused)) {
-            record.click();
-          }
-        },
-        onSpeechEnd: () => {
-          var stop = document.querySelector('.stop-button');
-          if (stop) stop.click();
-        }
-      });
-      myvad.start();
-    } catch (e) {
-      console.warn("VAD init failed:", e);
-    }
-  };
-  script2.src = "https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.7/dist/bundle.min.js";
-  document.head.appendChild(script2);
-}
-main();
-"""
-# ========================
-# BUILD GRADIO UI
-# ========================
-with gr.Blocks(js=custom_js, title="ASR → LLM → TTS (Safe)") as demo:
-    gr.Markdown("## Speak: ASR → LLM → TTS (defensive, production-friendly)")
-    state = gr.State(AppState())
-    with gr.Row():
-        input_audio = gr.Audio(
-            label="🎙 Speak (microphone)",
-            source="microphone",      # <-- Added source argument here
-            type="numpy",
-            streaming=False,
-            show_label=True,
-        )
-    with gr.Row():
-        transcription_out = gr.Textbox(label="Transcription", interactive=False)
-    with gr.Row():
-        chatbot = gr.Chatbot(label="Conversation")
-    with gr.Row():
-        output_audio = gr.Audio(label="Assistant speech (TTS)", type="filepath")
-    # Wire events:
-    # When recording starts/stops - process transcription and update UI
-    input_audio.start_recording(
-        fn=process_audio_start,
-        inputs=[input_audio, state],
-        outputs=[state, transcription_out],
-    )
-    # When recording stops - generate reply and update chatbot + audio output
-    input_audio.stop_recording(
-        fn=generate_reply_stop,
-        inputs=[state],
-        outputs=[state, chatbot, output_audio],
-    )
-    # Manual trigger button to generate reply if needed
-    gen_btn = gr.Button("Generate reply (manual)")
-    gen_btn.click(fn=generate_reply_stop, inputs=[state], outputs=[state, chatbot, output_audio])
-# ========================
-# LAUNCH
-# ========================
-if __name__ == "__main__":
-    demo.launch()