Voxtral_Mini_Evaluation

Running

App Files Files Community

Loren committed on Jul 25

Commit

8cf4656

verified ·

1 Parent(s): 8f57ebb

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -15

app.py CHANGED Viewed

@@ -52,7 +52,31 @@ def process_translate(language: str, audio_path: str) -> str:
     decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
     return decoded_outputs[0]
 def disable_buttons():
     """Return three `gr.update(interactive=False)` values, one per button to make non-interactive."""
     return gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
@@ -86,25 +110,24 @@ dict_languages = {"English": "en",
 #### Gradio interface
 with gr.Blocks(title="Voxtral") as voxtral:
-    gr.Markdown("# Voxtral Mini Evaluation")
-    gr.Markdown("""### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \
     capabilities while retaining best-in-class text performance.
-    It excels at speech transcription, translation and audio understanding.""")
     with gr.Accordion("🔎 More on Voxtral", open=False):
-        gr.Markdown("## Key features:")
         gr.Markdown("""## **Key Features:**
-Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
-- **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly
-- **Long-form context**: With a 32k token context length, Voxtral handles audios up to 30 minutes for transcription, or 40 minutes for understanding
-- **Built-in Q&A and summarization**: Supports asking questions directly through audio. Analyze audio and generate structured summaries without the need for separate ASR and language models
-- **Natively multilingual**: Automatic language detection and state-of-the-art performance in the world’s most widely used languages (English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)
-- **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents
-- **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B""")
-    gr.Markdown("## Upload an audio file, record via microphone, or select a demo file:")
     gr.Markdown("### *(Voxtral handles audios up to 30 minutes for transcription)*")
     with gr.Row():
@@ -119,7 +142,10 @@ Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
             cache_examples=False,
             run_on_click=False
         )
     with gr.Row():
         with gr.Column():
             with gr.Accordion("📝 Transcription", open=True):
@@ -144,7 +170,7 @@ Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
         with gr.Column():
             with gr.Accordion("🤖 Ask audio file", open=True):
-                question = gr.Textbox(label="Ask audio file", placeholder="Enter your question about audio file")
                 submit_chat = gr.Button("Ask audio file:", variant="primary")
                 text_chat = gr.Textbox(label="💬 Model answer", lines=10)
@@ -178,7 +204,20 @@ Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
         outputs=[submit_transcript, submit_translate, submit_chat],
     )
 ### Launch the app
 if __name__ == "__main__":

     decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
     return decoded_outputs[0]
+###
+@spaces.GPU
+def process_chat(question: str, audio_path: str) -> str:
+    """Answer a text question about an audio file.
+
+    Args:
+        question: Question text, sent as the text part of the user turn.
+        audio_path: Path of the audio file, sent as the audio part of the user turn.
+
+    Returns:
+        The first decoded string of the generated output, with special tokens skipped.
+    """
+    # Single user turn carrying two content parts: the audio first, then the question.
+    conversation = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "audio",
+                    "path": audio_path,
+                },
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+    inputs = processor.apply_chat_template(conversation)
+    inputs = inputs.to(device, dtype=torch.bfloat16)
+    # Cap the answer at 500 newly generated tokens.
+    outputs = model.generate(**inputs, max_new_tokens=500)
+    # Drop the first input_ids.shape[1] positions (presumably the echoed prompt) before decoding.
+    decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+    return decoded_outputs[0]
+###
 def disable_buttons():
     """Return three `gr.update(interactive=False)` values, one per button to make non-interactive."""
     return gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)
 #### Gradio interface
 with gr.Blocks(title="Voxtral") as voxtral:
+    gr.Markdown("# **Voxtral Mini Evaluation**")
+    gr.Markdown("""#### Voxtral Mini is an enhancement of **Ministral 3B**, incorporating state-of-the-art audio input \
     capabilities while retaining best-in-class text performance.
+    #### It excels at speech transcription, translation and audio understanding.""")
     with gr.Accordion("🔎 More on Voxtral", open=False):
         gr.Markdown("""## **Key Features:**
+#### Voxtral builds upon Ministral-3B with powerful audio understanding capabilities.
+#### - **Dedicated transcription mode**: Voxtral can operate in a pure speech transcription mode to maximize performance. By default, Voxtral automatically predicts the source audio language and transcribes the text accordingly
+#### - **Long-form context**: With a 32k token context length, Voxtral handles audios up to 30 minutes for transcription, or 40 minutes for understanding
+#### - **Built-in Q&A and summarization**: Supports asking questions directly through audio. Analyze audio and generate structured summaries without the need for separate ASR and language models
+#### - **Natively multilingual**: Automatic language detection and state-of-the-art performance in the world’s most widely used languages (English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)
+#### - **Function-calling straight from voice**: Enables direct triggering of backend functions, workflows, or API calls based on spoken user intents
+#### - **Highly capable at text**: Retains the text understanding capabilities of its language model backbone, Ministral-3B""")
+    gr.Markdown("### **1. Upload an audio file, record via microphone, or select a demo file:**")
     gr.Markdown("### *(Voxtral handles audios up to 30 minutes for transcription)*")
     with gr.Row():
             cache_examples=False,
             run_on_click=False
         )
+    with gr.Row():
+        gr.Markdown("### **2. Choose one of theese tasks:**")
     with gr.Row():
         with gr.Column():
             with gr.Accordion("📝 Transcription", open=True):
         with gr.Column():
             with gr.Accordion("🤖 Ask audio file", open=True):
+                question_chat = gr.Textbox(label="Ask audio file", placeholder="Enter your question about audio file")
                 submit_chat = gr.Button("Ask audio file:", variant="primary")
                 text_chat = gr.Textbox(label="💬 Model answer", lines=10)
         outputs=[submit_transcript, submit_translate, submit_chat],
     )
+    # Chat
+    submit_chat.click(
+        disable_buttons,
+        outputs=[submit_transcript, submit_translate, submit_chat],
+        trigger_mode="once",
+    ).then(
+        fn=process_chat,
+        inputs=[question_chat, sel_audio],
+        outputs=text_chat
+    ).then(
+        enable_buttons,
+        outputs=[submit_transcript, submit_translate, submit_chat],
+    )
 ### Launch the app
 if __name__ == "__main__":