MohamedRashad committed on
Commit
f0d5b79
·
1 Parent(s): ede6b30

Add initial implementation of Voxtral audio processing app with Gradio interface

Browse files
Files changed (2) hide show
  1. app.py +96 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ import gradio as gr
4
+ import spaces
5
+ import torch
6
+ from huggingface_hub import snapshot_download
7
+ from transformers import AutoProcessor, VoxtralForConditionalGeneration
8
+
9
# Model paths and setup
def _fetch_snapshot(repo_id, revision):
    """Download one model repository next to this file and return its local path.

    The snapshot is stored in a folder named after the last component of
    ``repo_id`` inside this file's directory.

    ``resume_download`` is intentionally not passed: huggingface_hub has
    deprecated that argument (interrupted downloads are resumed
    automatically), so passing it only triggers a deprecation warning.
    """
    target_dir = Path(__file__).parent / repo_id.rsplit("/", 1)[-1]
    return snapshot_download(
        repo_id=repo_id,
        revision=revision,
        local_dir=target_dir,
    )


voxtral_mini_path = _fetch_snapshot('mistralai/Voxtral-Mini-3B-2507', 'refs/pr/16')
print(f"Voxtral Mini model downloaded to: {voxtral_mini_path}")

voxtral_small_path = _fetch_snapshot('mistralai/Voxtral-Small-24B-2507', 'refs/pr/9')
print(f"Voxtral Small model downloaded to: {voxtral_small_path}")

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


# Load model and processor
def _load_voxtral(model_path):
    """Return ``(processor, model)`` loaded from a local snapshot directory.

    The model is loaded in bfloat16 and placed on the module-level ``device``.
    """
    processor = AutoProcessor.from_pretrained(model_path)
    model = VoxtralForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        device_map=device,
    )
    return processor, model


voxtral_mini_processor, voxtral_mini_model = _load_voxtral(voxtral_mini_path)
voxtral_small_processor, voxtral_small_model = _load_voxtral(voxtral_small_path)
35
+
36
@spaces.GPU()
def process_audio(audio_path, model_name, language="en", max_tokens=500):
    """Process audio with the selected Voxtral model and return the generated text.

    Args:
        audio_path: Path to the audio file. A falsy value (``None`` or an
            empty string) returns a prompt message instead of running the model.
        model_name: Either ``"Voxtral Mini (3B)"`` or ``"Voxtral Small (24B)"``.
            Any other value returns an error message.
        language: Language code forwarded to the processor's transcription
            request.
        max_tokens: Upper bound on the number of newly generated tokens. It is
            coerced to ``int`` because UI sliders may deliver the value as a
            float, while ``max_new_tokens`` is expected to be an integer.
            ``None`` is passed through unchanged.

    Returns:
        The decoded text of the first generated sequence, or a user-facing
        message when the input is missing or the model name is unknown.
    """
    if not audio_path:
        return "Please upload an audio file."

    if model_name == "Voxtral Mini (3B)":
        model = voxtral_mini_model
        processor = voxtral_mini_processor
        repo_id = str(voxtral_mini_path)
    elif model_name == "Voxtral Small (24B)":
        model = voxtral_small_model
        processor = voxtral_small_processor
        repo_id = str(voxtral_small_path)
    else:
        return "Invalid model selected."

    # Normalise the token budget once so generate() always sees an integer.
    max_new_tokens = max_tokens if max_tokens is None else int(max_tokens)

    inputs = processor.apply_transcription_request(language=language, audio=audio_path, model_id=repo_id)
    inputs = inputs.to(device, dtype=torch.bfloat16)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Skip the first input_ids.shape[1] positions of every output row so only
    # the tokens after the prompt-length prefix are decoded.
    prompt_length = inputs.input_ids.shape[1]
    decoded_outputs = processor.batch_decode(outputs[:, prompt_length:], skip_special_tokens=True)

    return decoded_outputs[0]
60
+
61
# Define Gradio interface
with gr.Blocks(title="Voxtral Demo") as demo:
    # Page heading and short usage hint.
    gr.Markdown("# Voxtral Audio Processing Demo")
    gr.Markdown("Upload an audio file and get a transcription/response from Voxtral.")

    with gr.Row():
        # Input column: audio upload, model/language pickers, token budget, submit.
        with gr.Column():
            audio_input = gr.Audio(label="Upload Audio", type="filepath")
            model_selector = gr.Dropdown(
                label="Select Model",
                value="Voxtral Mini (3B)",
                choices=["Voxtral Mini (3B)", "Voxtral Small (24B)"],
            )
            language = gr.Dropdown(
                label="Language",
                value="en",
                choices=["en", "fr", "de", "es", "it", "pt", "nl", "ru", "zh", "ja", "ar"],
            )
            max_tokens = gr.Slider(
                label="Max Output Tokens",
                minimum=50,
                maximum=1000,
                step=50,
                value=500,
            )
            submit_btn = gr.Button("Process Audio")

        # Output column: the text returned by process_audio.
        with gr.Column():
            output_text = gr.Textbox(lines=10, label="Generated Response")

    # Clicking the button sends the four inputs to process_audio and shows
    # its return value in the output textbox.
    submit_btn.click(
        fn=process_audio,
        inputs=[audio_input, model_selector, language, max_tokens],
        outputs=output_text,
    )
93
+
94
# Launch the app
if __name__ == "__main__":
    # Enable the request queue first, then start the server with a share link.
    queued_demo = demo.queue()
    queued_demo.launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ git+https://github.com/huggingface/transformers
2
+ gradio
3
+ torch
4
+ spaces