Spaces:

IFMedTechdemo
/

Audio2Text

Running

File size: 2,735 Bytes

c72ac42

"""
Whisper Audio-to-Text – ZeroGPU edition
Runs on 🤗 Spaces with ZeroGPU (A100) accelerator
"""
import os
import tempfile
import gradio as gr
import whisper
import numpy as np
from huggingface_hub import hf_hub_download

# ------------------------------------------------------------------
# 1.  ZeroGPU decorator
# ------------------------------------------------------------------
import spaces                       # pip install huggingface-hub>=0.16

# ------------------------------------------------------------------
# 2.  Load model once per GPU worker
# ------------------------------------------------------------------
MODEL_ID = "openai/whisper-base"    # pick any HF whisper ckpt
MODEL = None

def _load_model():
    global MODEL
    if MODEL is None:
        # download weights from HF hub (cached)
        ckpt = hf_hub_download(repo_id=MODEL_ID, filename="pytorch_model.bin")
        MODEL = whisper.load_model("base")   # still uses same weights
    return MODEL

# ------------------------------------------------------------------
# 3.  GPU-decorated transcription
# ------------------------------------------------------------------
@spaces.GPU
def transcribe(audio):
    """
    audio: filepath (upload) or (sr, data) tuple (mic)
    returns: transcribed text
    """
    if audio is None:
        return "⚠️  No audio received."

    # ---- handle microphone ----
    if isinstance(audio, tuple):
        sr, data = audio
        data = data.astype(np.float32)
        if np.abs(data).max() > 1.0:
            data /= np.abs(data).max()
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            import soundfile as sf
            sf.write(tmp.name, data, sr)
            audio_path = tmp.name
    else:
        audio_path = audio

    # ---- run Whisper on GPU ----
    try:
        model = _load_model()
        result = model.transcribe(audio_path, fp16=True)   # fp16 OK on GPU
        text = result["text"].strip()
        return text if text else "🤷‍♂️ No speech detected."
    except Exception as e:
        return f"❌ Error: {e}"
    finally:
        if audio_path != audio and os.path.exists(audio_path):
            os.unlink(audio_path)

# ------------------------------------------------------------------
# 4.  Gradio UI (unchanged)
# ------------------------------------------------------------------
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["upload", "microphone"], type="filepath"),
    outputs=gr.Textbox(label="Transcription", lines=6),
    title="🎙️ Whisper Audio-to-Text (ZeroGPU)",
    description="Upload or record audio → instant transcription on A100.",
)

if __name__ == "__main__":
    demo.launch()