# Audio2Text — app.py (Hugging Face Space: IFMedTechdemo, commit c72ac42)
"""
Whisper Audio-to-Text – ZeroGPU edition
Runs on πŸ€— Spaces with ZeroGPU (A100) accelerator
"""
import os
import tempfile
import gradio as gr
import whisper
import numpy as np
from huggingface_hub import hf_hub_download
# ------------------------------------------------------------------
# 1. ZeroGPU decorator
# ------------------------------------------------------------------
import spaces # pip install huggingface-hub>=0.16
# ------------------------------------------------------------------
# 2. Load model once per GPU worker
# ------------------------------------------------------------------
MODEL_ID = "openai/whisper-base" # pick any HF whisper ckpt (informational)
MODEL = None # per-worker cached model; populated lazily by _load_model()
def _load_model():
    """Load the Whisper "base" model once per worker and cache it globally.

    Returns:
        The cached ``whisper`` model instance (loaded on first call).
    """
    global MODEL
    if MODEL is None:
        # NOTE(review): the original also ran
        #   hf_hub_download(repo_id=MODEL_ID, filename="pytorch_model.bin")
        # but whisper.load_model("base") downloads its own checkpoint from
        # OpenAI's CDN and never reads that file — the extra ~290 MB download
        # was dead weight, so it has been removed.
        MODEL = whisper.load_model("base")
    return MODEL
# ------------------------------------------------------------------
# 3. GPU-decorated transcription
# ------------------------------------------------------------------
@spaces.GPU
def transcribe(audio):
    """
    Transcribe uploaded or recorded audio with Whisper.

    audio: filepath (upload) or (sr, data) tuple (mic)
    returns: transcribed text, or a user-facing status/error message
    """
    if audio is None:
        return "⚠️ No audio received."
    # ---- handle microphone ----
    audio_path = audio
    if isinstance(audio, tuple):
        sr, data = audio
        data = np.asarray(data, dtype=np.float32)
        # Gradio delivers stereo recordings as (samples, channels); Whisper
        # expects mono, so average the channels down to one.
        if data.ndim > 1:
            data = data.mean(axis=1)
        # Normalize only when needed; guard against an empty recording,
        # where .max() would raise.
        peak = float(np.abs(data).max()) if data.size else 0.0
        if peak > 1.0:
            data /= peak
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            import soundfile as sf
            sf.write(tmp.name, data, sr)
            audio_path = tmp.name
    # ---- run Whisper on GPU ----
    try:
        model = _load_model()
        # fp16 halves memory/compute on CUDA; on CPU it only triggers a
        # fallback warning, so enable it based on actual device availability.
        import torch
        result = model.transcribe(audio_path, fp16=torch.cuda.is_available())
        text = result["text"].strip()
        return text if text else "πŸ€·β€β™‚οΈ No speech detected."
    except Exception as e:
        # Surface the failure in the UI instead of crashing the worker.
        return f"❌ Error: {e}"
    finally:
        # Delete the temp WAV we created for microphone input (upload paths
        # are owned by Gradio and left alone).
        if audio_path != audio and os.path.exists(audio_path):
            os.unlink(audio_path)
# ------------------------------------------------------------------
# 4. Gradio UI (unchanged)
# ------------------------------------------------------------------
# Assemble the UI: one audio input (file upload or mic) feeding the
# transcription function, one multi-line textbox for the result.
_audio_in = gr.Audio(sources=["upload", "microphone"], type="filepath")
_text_out = gr.Textbox(label="Transcription", lines=6)

demo = gr.Interface(
    fn=transcribe,
    inputs=_audio_in,
    outputs=_text_out,
    title="πŸŽ™οΈ Whisper Audio-to-Text (ZeroGPU)",
    description="Upload or record audio β†’ instant transcription on A100.",
)

if __name__ == "__main__":
    demo.launch()