import os
import tempfile

import requests
import torch
import torchaudio
import gradio as gr
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

model_name = "ibm-granite/granite-speech-3.3-8b"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Granite Speech processor and model once at startup.
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name).to(device)


def download_audio_from_url(url):
    """Download an audio file from a direct URL and load it with torchaudio."""
    response = requests.get(url, timeout=60)
    if response.status_code != 200:
        raise Exception("Failed to download file from URL.")
    # Write to a temporary file so torchaudio can read it from disk.
    # (The .mp3 suffix is only a hint; torchaudio inspects the file contents.)
    tmp = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    tmp.write(response.content)
    tmp.close()
    waveform, sr = torchaudio.load(tmp.name)
    os.unlink(tmp.name)  # clean up the temporary file
    return waveform, sr


def transcribe_from_url(audio_url, translate_to=None):
    waveform, sr = download_audio_from_url(audio_url)

    # Downmix to mono and resample to the 16 kHz rate the model expects.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16000:
        waveform = torchaudio.functional.resample(waveform, sr, 16000)

    # First pass: transcribe the audio.
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, num_beams=5, max_new_tokens=512)
    text = processor.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # Optional second pass: prepend a translation control tag to the transcript
    # and re-run generation on the text.
    if translate_to:
        text = f"<|translate_to={translate_to}|> " + text
        inputs2 = processor(text, return_tensors="pt").to(device)
        outputs2 = model.generate(**inputs2, num_beams=5)
        text = processor.tokenizer.batch_decode(outputs2, skip_special_tokens=True)[0]

    return text


gr.Interface(
    fn=transcribe_from_url,
    inputs=[
        gr.Textbox(
            label="🎧 Audio File URL (.mp3, .wav)",
            placeholder="Paste Google Drive direct link or other audio URL",
        ),
        gr.Dropdown(
            choices=[None, "fr", "es", "it", "de", "pt", "ja", "zh"],
            label="Translate to (optional)",
        ),
    ],
    outputs=gr.Textbox(label="📝 Transcription / Translation"),
    title="Granite Speech 3.3-8B - Audio from URL",
    description="Paste a direct URL to an audio file (Google Drive with 'uc?export=download' format or any MP3/WAV link)",
).launch()
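
# The interface description above refers to Google Drive's 'uc?export=download'
# URL form. As a minimal illustrative sketch (not part of the app itself), the
# hypothetical helper below shows one way to turn a standard Drive share link
# into that direct-download form. It assumes the usual .../file/d/<FILE_ID>/...
# share-link pattern and does not handle Drive's extra confirmation step for
# very large files; in practice you could define it above transcribe_from_url
# and apply it to audio_url before downloading.
import re


def to_drive_direct_url(share_url):
    """Convert a Google Drive share link into a direct-download URL (sketch)."""
    match = re.search(r"/file/d/([^/]+)", share_url)
    if not match:
        return share_url  # already a direct link, or not a Drive URL
    file_id = match.group(1)
    return f"https://drive.google.com/uc?export=download&id={file_id}"


# Example usage: paste the converted URL into the "Audio File URL" box.
# to_drive_direct_url("https://drive.google.com/file/d/FILE_ID/view?usp=sharing")
# -> "https://drive.google.com/uc?export=download&id=FILE_ID"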