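"""Gradio app that transcribes uploaded audio with a Hugging Face Whisper ASR pipeline."""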
import os
import gradio as gr
import torch
from transformers import pipeline

MODEL_ID = "openai/whisper-small"
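# Other Whisper checkpoints (e.g. "openai/whisper-base", "openai/whisper-medium") are drop-in replacements.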

def load_asr():
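    """Build a transformers ASR pipeline, preferring GPU when available."""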
    # Prefer GPU if available, else CPU. For transformers pipelines:
    # device: int index for CUDA, or -1 for CPU.
    device = 0 if torch.cuda.is_available() else -1
    print(f"🎤 Loading transcription pipeline on {'GPU' if device == 0 else 'CPU'}...")
    return pipeline(
        task="automatic-speech-recognition",
        model=MODEL_ID,
        device=device
    )

asr = load_asr()

def transcribe_audio(audio_file_path):
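    """Transcribe the audio at `audio_file_path`, returning text or an error message."""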
    if not audio_file_path:
        return "Please upload an audio file."
    if not os.path.exists(audio_file_path):
        return f"Error: file not found at {audio_file_path}"

    print(f"→ Transcribing: {audio_file_path}")
    try:
        # chunk_length_s works with Whisper in transformers
        result = asr(audio_file_path, chunk_length_s=30, return_timestamps=True)
        # result is a dict with "text" and possibly "chunks"
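        # e.g. {"text": "full transcript", "chunks": [{"timestamp": (0.0, 5.3), "text": "..."}]}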
        return result.get("text", "").strip() or "(No text recognized)"
    except Exception as e:
        return f"Error during transcription: {e}"

iface = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath", label="Upload audio (MP3/WAV)"),
    outputs=gr.Textbox(label="Transcription"),
    title="Audio Transcription Pipeline",
    description="Upload an audio file and get a Whisper-small transcription.",
)

if __name__ == "__main__":
    # Bind to all interfaces for Docker/Spaces
    iface.launch(server_name="0.0.0.0", server_port=7860)