Loren committed on
Commit
8cf4656
·
verified ·
1 Parent(s): 8f57ebb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -15
app.py CHANGED
@@ -52,7 +52,31 @@ def process_translate(language: str, audio_path: str) -> str:
52
  decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
53
 
54
  return decoded_outputs[0]
 
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
def disable_buttons():
    """Grey out all three task buttons while a request is being processed.

    Returns one `gr.update(interactive=False)` per button, in the order
    (transcribe, translate, chat), matching the `outputs=` wiring below.
    """
    return tuple(gr.update(interactive=False) for _ in range(3))
@@ -86,25 +110,24 @@ dict_languages = {"English": "en",
86
 
87
  #### Gradio interface
88
  with gr.Blocks(title="Voxtral") as voxtral:
89
- gr.Markdown("# Voxtral Mini Evaluation")
90
- gr.Markdown("""### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \
91
  capabilities while retaining best-in-class text performance.
92
- It excels at speech transcription, translation and audio understanding.""")
93
 
94
  with gr.Accordion("🔎 More on Voxtral", open=False):
95
- gr.Markdown("## Key features:")
96
  gr.Markdown("""## **Key Features:**
97
 
98
- Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
99
- - **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly
100
- - **Long-form context**: With a 32k token context length, Voxtral handles audios up to 30 minutes for transcription, or 40 minutes for understanding
101
- - **Built-in Q&A and summarization**: Supports asking questions directly through audio. Analyze audio and generate structured summaries without the need for separate ASR and language models
102
- - **Natively multilingual**: Automatic language detection and state-of-the-art performance in the world’s most widely used languages (English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)
103
- - **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents
104
- - **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B""")
105
 
106
 
107
- gr.Markdown("## Upload an audio file, record via microphone, or select a demo file:")
108
  gr.Markdown("### *(Voxtral handles audios up to 30 minutes for transcription)*")
109
 
110
  with gr.Row():
@@ -119,7 +142,10 @@ Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
119
  cache_examples=False,
120
  run_on_click=False
121
  )
122
-
 
 
 
123
  with gr.Row():
124
  with gr.Column():
125
  with gr.Accordion("📝 Transcription", open=True):
@@ -144,7 +170,7 @@ Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
144
 
145
  with gr.Column():
146
  with gr.Accordion("🤖 Ask audio file", open=True):
147
- question = gr.Textbox(label="Ask audio file", placeholder="Enter your question about audio file")
148
  submit_chat = gr.Button("Ask audio file:", variant="primary")
149
  text_chat = gr.Textbox(label="💬 Model answer", lines=10)
150
 
@@ -178,7 +204,20 @@ Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
178
  outputs=[submit_transcript, submit_translate, submit_chat],
179
  )
180
 
181
-
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  ### Launch the app
183
 
184
  if __name__ == "__main__":
 
52
  decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
53
 
54
  return decoded_outputs[0]
55
+ ###
56
 
57
@spaces.GPU
def process_chat(question: str, audio_path: str) -> str:
    """Answer a free-form question about an audio file with Voxtral.

    Builds a single-turn chat message containing the audio plus the user's
    text question, runs generation, and returns only the newly generated
    answer (prompt tokens are stripped before decoding).
    """
    audio_part = {"type": "audio", "path": audio_path}
    text_part = {"type": "text", "text": question}
    messages = [{"role": "user", "content": [audio_part, text_part]}]

    batch = processor.apply_chat_template(messages)
    batch = batch.to(device, dtype=torch.bfloat16)

    generated = model.generate(**batch, max_new_tokens=500)
    # Decode only the tokens produced after the prompt.
    prompt_len = batch.input_ids.shape[1]
    answers = processor.batch_decode(
        generated[:, prompt_len:], skip_special_tokens=True
    )
    return answers[0]
79
+ ###
80
 
81
  def disable_buttons():
82
  return gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
 
110
 
111
  #### Gradio interface
112
  with gr.Blocks(title="Voxtral") as voxtral:
113
+ gr.Markdown("# **Voxtral Mini Evaluation**")
114
+ gr.Markdown("""#### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \
115
  capabilities while retaining best-in-class text performance.
116
+ #### It excels at speech transcription, translation and audio understanding.""")
117
 
118
  with gr.Accordion("🔎 More on Voxtral", open=False):
 
119
  gr.Markdown("""## **Key Features:**
120
 
121
+ #### Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
122
+ #### - **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly
123
+ #### - **Long-form context**: With a 32k token context length, Voxtral handles audios up to 30 minutes for transcription, or 40 minutes for understanding
124
+ #### - **Built-in Q&A and summarization**: Supports asking questions directly through audio. Analyze audio and generate structured summaries without the need for separate ASR and language models
125
+ #### - **Natively multilingual**: Automatic language detection and state-of-the-art performance in the world’s most widely used languages (English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)
126
+ #### - **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents
127
+ #### - **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B""")
128
 
129
 
130
+ gr.Markdown("### **1. Upload an audio file, record via microphone, or select a demo file:**")
131
  gr.Markdown("### *(Voxtral handles audios up to 30 minutes for transcription)*")
132
 
133
  with gr.Row():
 
142
  cache_examples=False,
143
  run_on_click=False
144
  )
145
+
146
+ with gr.Row():
147
+ gr.Markdown("### **2. Choose one of these tasks:**")
148
+
149
  with gr.Row():
150
  with gr.Column():
151
  with gr.Accordion("📝 Transcription", open=True):
 
170
 
171
  with gr.Column():
172
  with gr.Accordion("🤖 Ask audio file", open=True):
173
+ question_chat = gr.Textbox(label="Ask audio file", placeholder="Enter your question about audio file")
174
  submit_chat = gr.Button("Ask audio file:", variant="primary")
175
  text_chat = gr.Textbox(label="💬 Model answer", lines=10)
176
 
 
204
  outputs=[submit_transcript, submit_translate, submit_chat],
205
  )
206
 
207
+ # Chat
208
+ submit_chat.click(
209
+ disable_buttons,
210
+ outputs=[submit_transcript, submit_translate, submit_chat],
211
+ trigger_mode="once",
212
+ ).then(
213
+ fn=process_chat,
214
+ inputs=[question_chat, sel_audio],
215
+ outputs=text_chat
216
+ ).then(
217
+ enable_buttons,
218
+ outputs=[submit_transcript, submit_translate, submit_chat],
219
+ )
220
+
221
  ### Launch the app
222
 
223
  if __name__ == "__main__":