Spaces:
Running
on
Zero
Running
on
Zero
| import gradio as gr | |
| import spaces | |
| import torch | |
| from transformers import AutoProcessor, VoxtralForConditionalGeneration | |
# Run on the GPU when torch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load both Voxtral checkpoints up front: for each one, the processor first,
# then the weights in bfloat16 placed on the selected device.
voxtral_mini_processor = AutoProcessor.from_pretrained(
    "MohamedRashad/Voxtral-Mini-3B-2507-transformers"
)
voxtral_mini_model = VoxtralForConditionalGeneration.from_pretrained(
    "MohamedRashad/Voxtral-Mini-3B-2507-transformers",
    torch_dtype=torch.bfloat16,
    device_map=device,
)

voxtral_small_processor = AutoProcessor.from_pretrained(
    "MohamedRashad/Voxtral-Small-24B-2507-transformers"
)
voxtral_small_model = VoxtralForConditionalGeneration.from_pretrained(
    "MohamedRashad/Voxtral-Small-24B-2507-transformers",
    torch_dtype=torch.bfloat16,
    device_map=device,
)
# Display name -> language code, in the order shown in the language dropdown.
LANGUAGES = dict(
    (
        ("English", "en"),
        ("French", "fr"),
        ("German", "de"),
        ("Spanish", "es"),
        ("Italian", "it"),
        ("Portuguese", "pt"),
        ("Dutch", "nl"),
        ("Russian", "ru"),
        ("Chinese", "zh"),
        ("Japanese", "ja"),
        ("Arabic", "ar"),
    )
)
def process_audio(audio_path, model_name, lang_name, max_tokens=500):
    """Process audio with the selected Voxtral model and return the transcription.

    This function takes an audio file and processes it using the selected Voxtral model
    to generate a transcription in the specified language.

    Args:
        audio_path: Path to the audio file to be transcribed.
        model_name: Name of the Voxtral model to use ("Voxtral Mini (3B)" or "Voxtral Small (24B)").
        lang_name: Name of the language for transcription; must be a key of LANGUAGES (e.g., "English", "French", etc.).
        max_tokens: Maximum number of tokens to generate in the output (default: 500).

    Returns:
        String containing the transcribed text from the audio file, or an error message
        if the audio file is missing or an invalid model or language is selected.
    """
    # NOTE(review): `spaces` is imported by this file but no `@spaces.GPU`
    # decorator is visible on this function -- confirm whether the ZeroGPU
    # deployment needs one here.
    if not audio_path:
        return "Please upload an audio file."

    if model_name == "Voxtral Mini (3B)":
        model = voxtral_mini_model
        processor = voxtral_mini_processor
        repo_id = "MohamedRashad/Voxtral-Mini-3B-2507-transformers"
    elif model_name == "Voxtral Small (24B)":
        model = voxtral_small_model
        processor = voxtral_small_processor
        repo_id = "MohamedRashad/Voxtral-Small-24B-2507-transformers"
    else:
        return "Invalid model selected."

    # Report an unknown language the same way as the other input errors
    # instead of letting a KeyError escape to the caller.
    language = LANGUAGES.get(lang_name)
    if language is None:
        return "Invalid language selected."

    # Use the correctly spelled processor method when it exists; otherwise fall
    # back to the misspelled name this code originally relied on.
    build_request = getattr(processor, "apply_transcription_request", None)
    if build_request is None:
        build_request = processor.apply_transcrition_request

    inputs = build_request(language=language, audio=audio_path, model_id=repo_id)
    inputs = inputs.to(device, dtype=torch.bfloat16)

    # The token budget may arrive as a float (e.g. from a slider or a remote
    # caller); generation expects an integer count.
    outputs = model.generate(**inputs, max_new_tokens=int(max_tokens))

    # Drop the prompt tokens so only the newly generated text is decoded.
    prompt_length = inputs.input_ids.shape[1]
    decoded_outputs = processor.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)
    return decoded_outputs[0]
# Define Gradio interface: inputs on the left, transcription output on the
# right, with the button and the examples wired to the same four inputs.
with gr.Blocks(title="Voxtral Demo") as demo:
    gr.Markdown("# Voxtral Transcription Demo")
    gr.Markdown("Upload an audio file and get a transcription from Voxtral.")
    gr.Markdown("You can find the `transformers` version of Voxtral here: [3B](https://huggingface.co/MohamedRashad/Voxtral-Mini-3B-2507-transformers), [24B](https://huggingface.co/MohamedRashad/Voxtral-Small-24B-2507-transformers)")

    with gr.Row():
        with gr.Column():
            # type="filepath" hands process_audio a path string rather than raw samples.
            audio_input = gr.Audio(type="filepath", label="Upload Audio")
            model_selector = gr.Dropdown(
                choices=["Voxtral Mini (3B)", "Voxtral Small (24B)"],
                value="Voxtral Mini (3B)",
                label="Select Model"
            )
            language = gr.Dropdown(
                choices=list(LANGUAGES.keys()),
                value="English",
                label="Language"
            )
            max_tokens = gr.Slider(minimum=50, maximum=1000, value=500, step=50, label="Max Output Tokens")
            submit_btn = gr.Button("Extract Transcription", variant="primary")

        with gr.Column():
            output_text = gr.Textbox(label="Generated Response", lines=10)

    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, model_selector, language, max_tokens],
        outputs=output_text
    )

    # Each example row follows the order of `inputs`: audio path, model,
    # language, max tokens. `example_labels` is parallel to `examples`.
    gr.Examples(
        examples=[
            ["examples/english_armstrong_small_step.mp3", "Voxtral Mini (3B)", "English", 500],
            ["examples/french_mathis_voice_intro.mp3", "Voxtral Mini (3B)", "French", 500],
            ["examples/german_spehr_voice_intro.mp3", "Voxtral Mini (3B)", "German", 500],
            ["examples/japanese_ann01_announcement.mp3", "Voxtral Mini (3B)", "Japanese", 500],
            ["examples/arabic_news_report.mp3", "Voxtral Mini (3B)", "Arabic", 500],
            ["examples/arabic_yousif_saif_football.mp3", "Voxtral Small (24B)", "Arabic", 500],
        ],
        inputs=[audio_input, model_selector, language, max_tokens],
        example_labels=[
            "Neil Armstrong's 'small step' (English, 24s)",
            "Rémi Mathis voice intro (French, 16s)",
            "Christoph Spehr voice intro (German, 28s)",
            "Ann01 announcement (Japanese, 22s)",
            "News Report (Arabic, 10s)",
            "Football Commentary (Arabic, 11s)",
        ]
    )
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    # Enable request queuing, then launch without a public share link, with
    # SSR mode off and the MCP server enabled.
    queued_demo = demo.queue()
    queued_demo.launch(share=False, ssr_mode=False, mcp_server=True)