talk-to-smolvox

Running on T4

App Files Files Community

Steveeeeeeen HF Staff committed on Feb 14

Commit

a7e49b4

verified ·

1 Parent(s): 691f0df

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -39

app.py CHANGED Viewed

@@ -6,8 +6,6 @@ from twilio.rest import Client
 import os
 import torch
 import librosa
-import spaces
 pipe = transformers.pipeline(
     model="reach-vb/smolvox-smollm2-whisper-turbo",
@@ -23,9 +21,7 @@ auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
 if account_sid and auth_token:
     client = Client(account_sid, auth_token)
     token = client.tokens.create()
     rtc_configuration = {
         "iceServers": token.ice_servers,
         "iceTransportPolicy": "relay",
@@ -33,12 +29,8 @@ if account_sid and auth_token:
 else:
     rtc_configuration = None
-@spaces.GPU(duration=90)
-def transcribe(
-    audio: tuple[int, np.ndarray],
-    transformers_chat: list[dict],
-    conversation: list[dict],
-):
     original_sr = audio[0]
     target_sr = 16000
@@ -48,7 +40,7 @@ def transcribe(
     tf_input = [d for d in transformers_chat]
-    # Generate response from the pipeline using the audio input
     output = pipe(
         {"audio": audio_sr, "turns": tf_input, "sampling_rate": target_sr},
         max_new_tokens=512,
@@ -64,22 +56,16 @@ def transcribe(
     yield AdditionalOutputs(transformers_chat, conversation)
-@spaces.GPU(duration=90)
-def respond_text(
-    user_text: str,
-    transformers_chat: list[dict],
-    conversation: list[dict],
-):
     if not user_text.strip():
-        # Do nothing if the textbox is empty
         return transformers_chat, conversation
     # Append the user message from the textbox
     conversation.append({"role": "user", "content": user_text})
     transformers_chat.append({"role": "user", "content": user_text})
-    # Generate a response using the pipeline.
-    # Here we assume the pipeline can also process text input via the "text" key.
     output = pipe({"text": user_text, "turns": transformers_chat}, max_new_tokens=512)
     conversation.append({"role": "assistant", "content": output})
@@ -90,18 +76,19 @@ def respond_text(
 with gr.Blocks() as demo:
     gr.HTML(
         """
-        <h1 style='text-align: center'>
-            Talk to Smolvox Smollm2 (Powered by WebRTC ⚡️)
-        </h1>
-        <p style='text-align: center'>
-            Once you grant access to your microphone, you can talk naturally to Ultravox.
-            When you stop talking, the audio will be sent for processing.
-        </p>
-        <p style='text-align: center'>
-            Each conversation is limited to 90 seconds. Once the time limit is up you can rejoin the conversation.
-        </p>
-        """
     )
     # Shared conversation state
     transformers_chat = gr.State(
         value=[
@@ -112,13 +99,15 @@ with gr.Blocks() as demo:
         ]
     )
     with gr.Row():
         with gr.Column(scale=1):
-            transcript = gr.Chatbot(label="Transcript", type="messages")
             text_input = gr.Textbox(
-                placeholder="Type your message here...", label="Your Message"
             )
-            send_button = gr.Button("Send")
         with gr.Column(scale=1):
             audio = WebRTC(
                 rtc_configuration=rtc_configuration,
@@ -127,7 +116,7 @@ with gr.Blocks() as demo:
                 modality="audio",
             )
-    # Audio stream: when you stop speaking, process the audio input.
     audio.stream(
         ReplyOnPause(transcribe),
         inputs=[audio, transformers_chat, transcript],
@@ -141,14 +130,14 @@ with gr.Blocks() as demo:
         show_progress="hidden",
     )
-    # Text input: when you click "Send", process the typed message.
-    send_button.click(
         respond_text,
         inputs=[text_input, transformers_chat, transcript],
         outputs=[transformers_chat, transcript],
     )
-    # Optionally clear the text box after sending:
-    send_button.click(lambda: "", inputs=[], outputs=[text_input])
 if __name__ == "__main__":
     demo.launch()

 import os
 import torch
 import librosa
 pipe = transformers.pipeline(
     model="reach-vb/smolvox-smollm2-whisper-turbo",
 if account_sid and auth_token:
     client = Client(account_sid, auth_token)
     token = client.tokens.create()
     rtc_configuration = {
         "iceServers": token.ice_servers,
         "iceTransportPolicy": "relay",
 else:
     rtc_configuration = None
+def transcribe(audio: tuple[int, np.ndarray], transformers_chat: list[dict], conversation: list[dict]):
     original_sr = audio[0]
     target_sr = 16000
     tf_input = [d for d in transformers_chat]
+    # Generate a response from the pipeline using the audio input
     output = pipe(
         {"audio": audio_sr, "turns": tf_input, "sampling_rate": target_sr},
         max_new_tokens=512,
     yield AdditionalOutputs(transformers_chat, conversation)
+def respond_text(user_text: str, transformers_chat: list[dict], conversation: list[dict]):
     if not user_text.strip():
         return transformers_chat, conversation
     # Append the user message from the textbox
     conversation.append({"role": "user", "content": user_text})
     transformers_chat.append({"role": "user", "content": user_text})
+    # Generate a response using the pipeline. We assume it can process text input via "text"
     output = pipe({"text": user_text, "turns": transformers_chat}, max_new_tokens=512)
     conversation.append({"role": "assistant", "content": output})
 with gr.Blocks() as demo:
     gr.HTML(
         """
+    <h1 style='text-align: center'>
+    Talk to Smolvox Smollm2 1.7b (Powered by WebRTC ⚡️)
+    </h1>
+    <p style='text-align: center'>
+    Once you grant access to your microphone, you can talk naturally to Ultravox.
+    When you stop talking, the audio will be sent for processing.
+    </p>
+    <p style='text-align: center'>
+    Each conversation is limited to 90 seconds. Once the time limit is up you can rejoin the conversation.
+    </p>
+    """
     )
     # Shared conversation state
     transformers_chat = gr.State(
         value=[
         ]
     )
+    # Chat transcript at the top
+    transcript = gr.Chatbot(label="Transcript", type="messages")
+    # Lower row: text input and audio input side by side
     with gr.Row():
         with gr.Column(scale=1):
             text_input = gr.Textbox(
+                placeholder="Type your message here and press Enter...", label="Your Message"
             )
         with gr.Column(scale=1):
             audio = WebRTC(
                 rtc_configuration=rtc_configuration,
                 modality="audio",
             )
+    # Audio stream: process audio when speaking stops.
     audio.stream(
         ReplyOnPause(transcribe),
         inputs=[audio, transformers_chat, transcript],
         show_progress="hidden",
     )
+    # Text input: submit callback when pressing Enter.
+    text_input.submit(
         respond_text,
         inputs=[text_input, transformers_chat, transcript],
         outputs=[transformers_chat, transcript],
     )
+    # Clear text input after submission.
+    text_input.submit(lambda: "", inputs=[], outputs=[text_input])
 if __name__ == "__main__":
     demo.launch()