Update app.py
app.py CHANGED

@@ -2,7 +2,6 @@ import gradio as gr
 import torch
 from transformers import AutoProcessor, VoxtralForConditionalGeneration
 import spaces
-from gradio_modal import Modal
 
 #### Functions
 
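For reference, a minimal sketch of how the imports kept above are typically used to load Voxtral with transformers. The checkpoint id, precision, and device placement are assumptions for illustration; the diff does not show which repository or settings this Space actually uses.

```python
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration

# Assumed checkpoint id for illustration; the Space's actual repo id is not shown in this diff.
REPO_ID = "mistralai/Voxtral-Mini-3B-2507"

processor = AutoProcessor.from_pretrained(REPO_ID)
model = VoxtralForConditionalGeneration.from_pretrained(
    REPO_ID,
    torch_dtype=torch.bfloat16,  # assumed precision
    device_map="auto",           # place on GPU if one is available
)
```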
@@ -119,12 +118,12 @@ with gr.Blocks(title="Voxtral") as voxtral:
     gr.Markdown("""## **Key Features:**
 
 #### Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
-
-
-
-
-
-
+##### - **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly
+##### - **Long-form context**: With a 32k token context length, Voxtral handles audios up to 30 minutes for transcription, or 40 minutes for understanding
+##### - **Built-in Q&A and summarization**: Supports asking questions directly through audio. Analyze audio and generate structured summaries without the need for separate ASR and language models
+##### - **Natively multilingual**: Automatic language detection and state-of-the-art performance in the world’s most widely used languages (English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)
+##### - **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents
+##### - **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B""")
 
 
     gr.Markdown("### **1. Upload an audio file, record via microphone, or select a demo file:**")
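The dedicated transcription mode and built-in Q&A described in these bullets map onto the conversation-style API of the transformers Voxtral integration. A rough sketch of an audio question, assuming the `processor` and `model` from the previous snippet; the audio path, question text, and generation length are placeholders:

```python
# Pair an audio clip with a text question in a single user turn.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "audio", "path": "sample.wav"},  # placeholder audio file
            {"type": "text", "text": "Summarize this recording in two sentences."},
        ],
    }
]

inputs = processor.apply_chat_template(conversation)
inputs = inputs.to(model.device, dtype=torch.bfloat16)

outputs = model.generate(**inputs, max_new_tokens=500)
# Decode only the newly generated tokens, skipping the prompt.
answer = processor.batch_decode(
    outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
)[0]
print(answer)
```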
@@ -170,7 +169,7 @@ with gr.Blocks(title="Voxtral") as voxtral:
 
         with gr.Column():
             with gr.Accordion("🤖 Ask audio file", open=True):
-                question_chat = gr.Textbox(label="
+                question_chat = gr.Textbox(label="Enter your question about audio file:", placeholder="Enter your question about audio file")
                 submit_chat = gr.Button("Ask audio file:", variant="primary")
                 text_chat = gr.Textbox(label="💬 Model answer", lines=10)
 
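The new `question_chat` textbox presumably feeds a click handler on `submit_chat` that writes the answer into `text_chat`. A self-contained sketch of that wiring with a stub handler; `audio_input` and `ask_audio` are assumptions, since neither the audio component nor the callback appears in this diff:

```python
import gradio as gr

def ask_audio(audio_path: str, question: str) -> str:
    # Stub: the real app would run the Voxtral Q&A call sketched above on the
    # uploaded audio and return the decoded answer string.
    return f"(answer about {audio_path!r} for: {question})"

with gr.Blocks(title="Voxtral") as demo:
    audio_input = gr.Audio(type="filepath", label="Audio")  # assumed component, defined elsewhere in app.py
    with gr.Column():
        with gr.Accordion("🤖 Ask audio file", open=True):
            question_chat = gr.Textbox(
                label="Enter your question about audio file:",
                placeholder="Enter your question about audio file",
            )
            submit_chat = gr.Button("Ask audio file:", variant="primary")
            text_chat = gr.Textbox(label="💬 Model answer", lines=10)

    submit_chat.click(fn=ask_audio, inputs=[audio_input, question_chat], outputs=text_chat)

demo.launch()
```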