Spaces:

XiaomiMiMo
/

mimo_audio_chat

Running on CPU Upgrade

yanyihan-xiaomi committed on Sep 25

Commit

178417b

1 Parent(s): 966a9a2

Add Dockerfile and implement WebRTC functionality

- Created Dockerfile for environment setup.
- Added WebRTC handling in mimo_webrtc.py.
- Updated requirements.txt for new dependencies.
- Enhanced .gitignore for better file management.

Files changed (6) hide show

.gitignore +4 -2
Dockerfile +26 -0
README.md +4 -7
app.py → mimo_webrtc.py +32 -102
requirements.txt +4 -3
webrtc_vad.py +192 -0

.gitignore CHANGED Viewed

@@ -1,2 +1,4 @@
-**/__pycache__/**
-**/tmp/**

+__pycache__
+tmp
+.venv
+.vscode

Dockerfile ADDED Viewed

	@@ -0,0 +1,26 @@

+# Container image for the MiMo audio chat demo: installs system and Python
+# dependencies, then launches mimo_webrtc.py as an unprivileged user.
+FROM python:3.12-slim
+# Keep apt/debconf from prompting during the image build.
+ENV DEBIAN_FRONTEND=noninteractive
+# System packages; the apt package lists are deleted in the same layer so they
+# do not end up in the image.
+RUN apt-get update && apt-get install -y \
+    ca-certificates \
+    curl \
+    libc++1 \
+    ffmpeg \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+# Create a non-root user with UID 1000; every later instruction runs as it.
+RUN useradd -m -u 1000 user
+ENV HOME=/home/user
+# NOTE(review): presumably needed because pip, run as a non-root user, installs
+# console scripts under ~/.local/bin - confirm.
+ENV PATH="$HOME/.local/bin:$PATH"
+USER user
+WORKDIR /app
+# Copy requirements.txt on its own first so the dependency layer below stays
+# cached when only application code changes.
+COPY --chown=user ./requirements.txt requirements.txt
+RUN python3 -m pip install --no-cache-dir --upgrade pip \
+    && python3 -m pip install --no-cache-dir --upgrade -r requirements.txt
+COPY --chown=user . /app
+# -u keeps stdout/stderr unbuffered so log lines show up immediately.
+CMD ["python3.12", "-u", "mimo_webrtc.py"]

README.md CHANGED Viewed

@@ -1,13 +1,10 @@
 ---
 title: MiMo-Audio-Chat
-emoji: 🚀
 colorFrom: yellow
 colorTo: indigo
-sdk: gradio
-sdk_version: 5.44.1
-app_file: app.py
-pinned: false
-python_version: 3.12.7
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: MiMo-Audio-Chat
+emoji: 💬
 colorFrom: yellow
 colorTo: indigo
+sdk: docker
+app_port: 8087
 ---
+Check out the configuration reference at <https://huggingface.co/docs/hub/spaces-config-reference>

app.py → mimo_webrtc.py RENAMED Viewed

@@ -3,11 +3,12 @@ import queue
 import random
 import time
 from threading import Thread
-from typing import Any, Callable, Literal, override
 import fastrtc
 import gradio as gr
 import httpx
 import numpy as np
 from api_schema import (
@@ -22,6 +23,7 @@ from api_schema import (
     TokenizedConversation,
     TokenizedMessage,
 )
 HF_TOKEN = os.getenv("HF_TOKEN")
 SERVER_LIST = os.getenv("SERVER_LIST")
@@ -66,7 +68,7 @@ def auth_headers() -> dict[str, str]:
 def get_cloudflare_turn_credentials(
-    ttl: int = 1200,  # 20 minutes
 ) -> dict[str, Any]:
     with httpx.Client() as client:
         response = client.post(
@@ -85,74 +87,6 @@ def get_cloudflare_turn_credentials(
             )
-class NeverVAD(fastrtc.PauseDetectionModel):
-    def vad(self, *_args, **_kwargs):
-        raise RuntimeError("NeverVAD should not be called.")
-    def warmup(self):
-        pass
-class ReplyOnMuted(fastrtc.ReplyOnPause):
-    def __init__(
-        self,
-        fn: fastrtc.reply_on_pause.ReplyFnGenerator,
-        startup_fn: Callable | None = None,
-        can_interrupt: bool = True,
-        needs_args: bool = False,
-    ):
-        super().__init__(
-            fn,
-            startup_fn,
-            None,
-            None,
-            can_interrupt,
-            "mono",
-            24000,
-            None,
-            24000,
-            NeverVAD(),
-            needs_args,
-        )
-    def copy(self):
-        return ReplyOnMuted(
-            self.fn,
-            self.startup_fn,
-            self.can_interrupt,
-            self.needs_args,
-        )
-    def determine_pause(
-        self,
-        audio: np.ndarray,  # shape [samples,]
-        sampling_rate: int,
-        state: fastrtc.reply_on_pause.AppState,
-    ):
-        chunk_length = len(audio) / sampling_rate
-        if chunk_length > 0.1:
-            state.buffer = None
-            if not state.started_talking:
-                if not np.all(abs(audio) < 5):
-                    state.started_talking = True
-                    self.send_message_sync(
-                        fastrtc.utils.create_message("log", "started_talking")
-                    )
-            if state.started_talking:
-                if state.stream is None:
-                    state.stream = audio
-                else:
-                    state.stream = np.concatenate((state.stream, audio))
-                current_duration = len(state.stream) / sampling_rate
-                if current_duration > 1.0:
-                    last_segment = state.stream[-int(sampling_rate * 0.1) :]
-                    if np.all(abs(last_segment) < 5):
-                        return True
-        return False
 class ConversationManager:
     def __init__(self, assistant_style: AssistantStyle | None = None):
         self.conversation = TokenizedConversation(messages=[])
@@ -171,7 +105,15 @@ class ConversationManager:
     def append_audio_chunk(self, audio_chunk: tuple[int, np.ndarray]):
         sr, audio_data = audio_chunk
-        assert sr == 24000, "Only 24kHz audio is supported"
         if audio_data.ndim > 1:
             # [channels, samples] -> [samples,]
             # Not Gradio style
@@ -185,7 +127,6 @@ class ConversationManager:
     def chat(
         self,
-        url: httpx.URL,
         chat_id: int,
         input_audio: tuple[int, np.ndarray],
         global_sampler_config: SamplerConfig | None = None,
@@ -195,6 +136,7 @@ class ConversationManager:
         chat_queue = queue.Queue[ChatResponseItem | None]()
         def chat_task():
             req = ChatRequestBody(
                 conversation=self.conversation,
                 input_audio=ChatAudioBytes.from_audio(input_audio),
@@ -204,15 +146,11 @@ class ConversationManager:
             )
             first_output = True
             with httpx.Client() as client:
-                headers = {
-                    "Content-Type": "application/json",
-                    "Authorization": f"Bearer {HF_TOKEN}",  # <-- 加这一行
-                }
                 with client.stream(
                     method="POST",
                     url=url,
                     content=req.model_dump_json(),
-                    headers=headers,
                 ) as response:
                     if response.status_code != 200:
                         raise RuntimeError(f"Error {response.status_code}")
@@ -270,19 +208,6 @@ class ConversationManager:
                 yield None
-def get_microphone_svg(muted: bool | None = None):
-    muted_svg = '<line x1="1" y1="1" x2="23" y2="23"></line>' if muted else ""
-    return f"""
-    <svg xmlns="http://www.w3.org/2000/svg" width="1em" height="1em" fill="none" viewBox="0 0 24 24" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round" class="feather feather-mic" style="display: inline; vertical-align: middle;">
-        <path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"></path>
-        <path d="M19 10v2a7 7 0 0 1-14 0v-2"></path>
-        <line x1="12" y1="19" x2="12" y2="23"></line>
-        <line x1="8" y1="23" x2="16" y2="23"></line>
-        {muted_svg}
-    </svg>
-    """
 class ConversationAbortController(AbortController):
     manager: ConversationManager
     cur_turn: int | None
@@ -309,6 +234,20 @@ def new_chat_id():
     return chat_id
 def main():
     print("Starting WebRTC server")
@@ -401,9 +340,7 @@ def main():
         yield additional_outputs()
         try:
-            url = chat_server_url("/audio-chat")
             for chunk in manager.chat(
-                url,
                 chat_id,
                 input_audio,
             ):
@@ -453,13 +390,6 @@ def main():
         title_markdown = gr.Markdown(f"# {title}")
         with gr.Row():
             with gr.Column():
-                with gr.Accordion("Usage"):
-                    gr.HTML(
-                        f"<li>Note: FastRTC's built-in VAD is quite sensitive. For better stability across environments, this demo uses a manual end-of-speech flow. It simply detects if the microphone is muted. That may lead to a bad experience when using auto-denoise microphone. We are trying to find a stable VAD model that works well with FastRTC.</li>"
-                        f"<li>Click Request Microphone to grant permission, click Record to start a turn, and click Stop to end the turn and clear the conversation history.</li>"
-                        f"<li>After you finish speaking, click the microphone icon {get_microphone_svg()} to end your input and wait for MiMo's reply.</li>"
-                        f"<li>While MiMo is speaking, you can interrupt by clicking the muted microphone icon {get_microphone_svg(muted=True)} and then speaking a new instruction.</li>"
-                    )
                 chat = fastrtc.WebRTC(
                     label="WebRTC Chat",
                     modality="audio",
@@ -484,7 +414,7 @@ def main():
                         "- `Preset Prompt` controls the response style.\n"
                         "- `Preset Voice` controls the speaking tone.\n"
                         "- `Custom Prompt` lets you define the response style in natural language (overrides `Preset Prompt`).\n"
-                        "- For best results, choose prompts and voices that match your language.\n"
                         "- To apply new settings, end the current conversation and start a new one."
                     )
                 preset_character_dropdown = gr.Dropdown(
@@ -503,7 +433,7 @@ def main():
                 )
         chat.stream(
-            ReplyOnMuted(response),
             inputs=[
                 chat,
                 preset_character_dropdown,
@@ -526,7 +456,7 @@ def main():
             outputs=[title_markdown, preset_character_dropdown, preset_voice_dropdown],
         )
-    demo.launch(show_api=False)
 if __name__ == "__main__":

 import random
 import time
 from threading import Thread
+from typing import Any, Literal, override
 import fastrtc
 import gradio as gr
 import httpx
+import librosa
 import numpy as np
 from api_schema import (
     TokenizedConversation,
     TokenizedMessage,
 )
+from webrtc_vad import VADStreamHandler
 HF_TOKEN = os.getenv("HF_TOKEN")
 SERVER_LIST = os.getenv("SERVER_LIST")
 def get_cloudflare_turn_credentials(
+    ttl: int = 3600,  # 1 hour
 ) -> dict[str, Any]:
     with httpx.Client() as client:
         response = client.post(
             )
 class ConversationManager:
     def __init__(self, assistant_style: AssistantStyle | None = None):
         self.conversation = TokenizedConversation(messages=[])
     def append_audio_chunk(self, audio_chunk: tuple[int, np.ndarray]):
         sr, audio_data = audio_chunk
+        target_sr = 24000
+        if sr != target_sr:
+            audio_data = librosa.resample(
+                audio_data.astype(np.float32) / 32768.0,
+                orig_sr=sr,
+                target_sr=target_sr,
+            )
+            audio_data = (audio_data * 32767.0).astype(np.int16)
+            sr = target_sr
         if audio_data.ndim > 1:
             # [channels, samples] -> [samples,]
             # Not Gradio style
     def chat(
         self,
         chat_id: int,
         input_audio: tuple[int, np.ndarray],
         global_sampler_config: SamplerConfig | None = None,
         chat_queue = queue.Queue[ChatResponseItem | None]()
         def chat_task():
+            url = chat_server_url("/audio-chat")
             req = ChatRequestBody(
                 conversation=self.conversation,
                 input_audio=ChatAudioBytes.from_audio(input_audio),
             )
             first_output = True
             with httpx.Client() as client:
                 with client.stream(
                     method="POST",
                     url=url,
                     content=req.model_dump_json(),
+                    headers={"Content-Type": "application/json", **auth_headers()},
                 ) as response:
                     if response.status_code != 200:
                         raise RuntimeError(f"Error {response.status_code}")
                 yield None
 class ConversationAbortController(AbortController):
     manager: ConversationManager
     cur_turn: int | None
     return chat_id
+def parse_gradio_audio(gradio_audio: tuple[int, np.ndarray]):
+    """Convert a Gradio `(sample_rate, samples)` pair to channel-first layout.
+
+    Multi-dimensional input is transposed ([samples, channels] ->
+    [channels, samples]); one-dimensional input keeps its shape. int32
+    samples are rescaled to float32 by dividing by 2**31; every other dtype
+    is passed through untouched.
+
+    Returns:
+        `(sample_rate, samples)` where samples is [samples] or
+        [channels, samples].
+    """
+    sample_rate, samples = gradio_audio
+    is_multichannel = samples.ndim > 1
+    if is_multichannel:
+        samples = np.transpose(samples)
+    needs_rescale = samples.dtype == np.int32
+    if needs_rescale:
+        as_float = samples.astype(np.float32)
+        samples = as_float / 2**31
+    return sample_rate, samples
 def main():
     print("Starting WebRTC server")
         yield additional_outputs()
         try:
             for chunk in manager.chat(
                 chat_id,
                 input_audio,
             ):
         title_markdown = gr.Markdown(f"# {title}")
         with gr.Row():
             with gr.Column():
                 chat = fastrtc.WebRTC(
                     label="WebRTC Chat",
                     modality="audio",
                         "- `Preset Prompt` controls the response style.\n"
                         "- `Preset Voice` controls the speaking tone.\n"
                         "- `Custom Prompt` lets you define the response style in natural language (overrides `Preset Prompt`).\n"
+                        "- For best results, choose prompts and voices that **match your language**. The default settings are optimized for **English**.\n"
                         "- To apply new settings, end the current conversation and start a new one."
                     )
                 preset_character_dropdown = gr.Dropdown(
                 )
         chat.stream(
+            VADStreamHandler(response),
             inputs=[
                 chat,
                 preset_character_dropdown,
             outputs=[title_markdown, preset_character_dropdown, preset_voice_dropdown],
         )
+    demo.launch(server_name="0.0.0.0", server_port=8087, show_api=False)
 if __name__ == "__main__":

requirements.txt CHANGED Viewed

@@ -1,5 +1,6 @@
-fastapi==0.116.1
-pydantic==2.11.7
 fastrtc==0.0.33
-gradio==5.44.1
 httpx==0.28.1

 fastrtc==0.0.33
+gradio==5.35.0
 httpx==0.28.1
+numpy==2.2.6
+pydantic==2.11.7
+ten-vad @ git+https://github.com/TEN-framework/ten-vad.git

webrtc_vad.py ADDED Viewed

	@@ -0,0 +1,192 @@

+from dataclasses import dataclass
+from typing import Callable, Generator, override
+import fastrtc
+import librosa
+import numpy as np
+from ten_vad import TenVad
+@dataclass
+class VADEvent:
+    """Event yielded by `RealtimeVAD.process`; each event sets one of the two fields."""
+    # True once enough speech has accumulated in the current utterance
+    # (emitted at most once per utterance).
+    interrupt_signal: bool | None = None
+    # `(sample_rate, audio)` for a finished utterance; audio has shape
+    # [1, samples] at the source sample rate.
+    full_audio: tuple[int, np.ndarray] | None = None
+class RealtimeVAD:
+    """Streaming voice-activity detector that cuts a live feed into utterances.
+
+    Incoming audio is stored twice: at the source sample rate (this is what is
+    handed back when an utterance ends) and resampled to 16 kHz, which is fed
+    to `TenVad` in chunks of `hop_size` samples. Detection uses two thresholds:
+    a chunk must reach `start_threshold` to open an utterance and subsequent
+    chunks only need `end_threshold` to count as speech.
+
+    Args:
+        src_sr: Sample rate of the audio passed to `process`.
+        hop_size: Number of 16 kHz samples per VAD chunk.
+        start_threshold: Speech probability required to become active.
+        end_threshold: Speech probability required to count as speech while
+            active.
+        pad_start_s: Seconds of audio retained before the activating chunk.
+        min_positive_s: Accumulated speech time (not necessarily contiguous)
+            after which the interrupt signal is emitted.
+        min_silence_s: Seconds of continuous non-speech that end an utterance.
+    """
+    def __init__(
+        self,
+        src_sr: int = 24000,
+        hop_size: int = 256,
+        start_threshold: float = 0.8,
+        end_threshold: float = 0.7,
+        pad_start_s: float = 0.6,
+        min_positive_s: float = 0.4,
+        min_silence_s: float = 1.2,
+    ):
+        self.src_sr = src_sr
+        self.vad_sr = 16000
+        self.hop_size = hop_size
+        self.start_threshold = start_threshold
+        self.end_threshold = end_threshold
+        self.pad_start_s = pad_start_s
+        self.min_positive_s = min_positive_s
+        self.min_silence_s = min_silence_s
+        self.vad_model = TenVad(hop_size=hop_size)
+        # VAD buffer: 16 kHz int16 PCM waiting to be classified. It is consumed
+        # and trimmed `hop_size` samples at a time.
+        self.vad_buffer = np.array([], dtype=np.int16)
+        # Source buffer: int16 PCM at the source sample rate. While inactive it
+        # is a sliding window of about `pad_start_s` seconds; it is cut when a
+        # pause is detected (after `min_silence_s`).
+        self.src_buffer = np.array([], dtype=np.int16)
+        # Absolute stream position (in samples) of index 0 of each buffer.
+        self.vad_buffer_offset = 0
+        self.src_buffer_offset = 0
+        self.active = False
+        self.interrupt_signal = False
+        self.sum_positive_s = 0.0
+        self.silence_start_s: float | None = None
+    def _process_chunk(
+        self, chunk_offset_s: float, vad_chunk: np.ndarray
+    ) -> Generator[VADEvent, None, None]:
+        """Classify one `hop_size` chunk and advance the utterance state machine.
+
+        Args:
+            chunk_offset_s: Start time of the chunk, in seconds since the
+                beginning of the stream.
+            vad_chunk: `hop_size` samples of 16 kHz int16 PCM.
+
+        Yields:
+            `VADEvent(interrupt_signal=True)` once per utterance when
+            `min_positive_s` of speech has accumulated, and
+            `VADEvent(full_audio=...)` when such an utterance ends.
+        """
+        speech_prob, _ = self.vad_model.process(vad_chunk)
+        hop_s = self.hop_size / self.vad_sr
+        if not self.active:
+            if speech_prob >= self.start_threshold:
+                self.active = True
+                self.sum_positive_s = hop_s
+                print(f"[VAD] Active at {chunk_offset_s:.2f}s, {speech_prob=:.3f}")
+            else:
+                # Still idle: slide the source window so that only the last
+                # `pad_start_s` seconds are kept as lead-in.
+                new_src_offset = int(
+                    (chunk_offset_s - self.pad_start_s) * self.src_sr
+                )
+                cut_pos = new_src_offset - self.src_buffer_offset
+                if cut_pos > 0:
+                    self.src_buffer = self.src_buffer[cut_pos:]
+                    self.src_buffer_offset = new_src_offset
+            return
+        chunk_src_pos = int(chunk_offset_s * self.src_sr)
+        if speech_prob >= self.end_threshold:
+            self.silence_start_s = None
+            self.sum_positive_s += hop_s
+            if (
+                not self.interrupt_signal
+                and self.sum_positive_s >= self.min_positive_s
+            ):
+                self.interrupt_signal = True
+                yield VADEvent(interrupt_signal=True)
+                print(
+                    f"[VAD] Interrupt signal at {chunk_offset_s:.2f}s, {speech_prob=:.3f}"
+                )
+        elif self.silence_start_s is None:
+            self.silence_start_s = chunk_offset_s
+        if (
+            self.silence_start_s is not None
+            and chunk_offset_s - self.silence_start_s >= self.min_silence_s
+        ):
+            # Inactive now
+            cut_pos = chunk_src_pos - self.src_buffer_offset
+            # Utterances that never reached `min_positive_s` of speech are
+            # dropped without being reported.
+            if self.interrupt_signal:
+                webrtc_audio = self.src_buffer[np.newaxis, :cut_pos]
+                yield VADEvent(full_audio=(self.src_sr, webrtc_audio))
+                print(
+                    f"[VAD] Full audio at {chunk_offset_s:.2f}s, {webrtc_audio.shape=}"
+                )
+            self.src_buffer = self.src_buffer[cut_pos:]
+            self.src_buffer_offset = chunk_src_pos
+            self.active = False
+            self.interrupt_signal = False
+            self.sum_positive_s = 0.0
+            self.silence_start_s = None
+    def process(self, audio_data: np.ndarray) -> Generator[VADEvent, None, None]:
+        """Feed one frame of audio and yield the VAD events it triggers.
+
+        This is a generator: nothing is buffered or classified until it is
+        iterated, and the VAD buffer is only trimmed once it is exhausted.
+
+        Args:
+            audio_data: Samples at `src_sr`, either [samples] or
+                [channels, samples] (only the first channel is used).
+                NOTE(review): the 1/32768 scaling assumes int16 PCM - confirm
+                against the caller.
+
+        Yields:
+            `VADEvent` instances, in stream order.
+        """
+        if audio_data.ndim == 2:
+            # FastRTC style [channels, samples]
+            audio_data = audio_data[0]
+        if audio_data.size == 0:
+            # Nothing to buffer; also avoids asking librosa to resample a
+            # zero-length signal.
+            return
+        # Append to buffers
+        self.src_buffer = np.concatenate((self.src_buffer, audio_data))
+        vad_audio_data = librosa.resample(
+            audio_data.astype(np.float32) / 32768.0,
+            orig_sr=self.src_sr,
+            target_sr=self.vad_sr,
+        )
+        vad_audio_data = (vad_audio_data * 32767.0).round().astype(np.int16)
+        self.vad_buffer = np.concatenate((self.vad_buffer, vad_audio_data))
+        vad_buffer_size = self.vad_buffer.shape[0]
+        # Stays 0 when fewer than `hop_size` samples are buffered, so the
+        # trimming below is a no-op instead of hitting an unbound local.
+        processed_samples = 0
+        # The `+ 1` lets a chunk that ends exactly at the end of the buffer be
+        # processed now rather than on the next call.
+        for chunk_pos in range(
+            0, vad_buffer_size - self.hop_size + 1, self.hop_size
+        ):
+            chunk_offset_s = (self.vad_buffer_offset + chunk_pos) / self.vad_sr
+            vad_chunk = self.vad_buffer[chunk_pos : chunk_pos + self.hop_size]
+            yield from self._process_chunk(chunk_offset_s, vad_chunk)
+            processed_samples = chunk_pos + self.hop_size
+        self.vad_buffer = self.vad_buffer[processed_samples:]
+        self.vad_buffer_offset += processed_samples
+# Generator that produces the items a stream handler emits back to the client.
+type StreamerGenerator = Generator[fastrtc.tracks.EmitType, None, None]
+# Reply callback: receives `(sample_rate, audio)` plus one string argument and
+# returns a generator of items to emit.
+type StreamerFn = Callable[[tuple[int, np.ndarray], str], StreamerGenerator]
+class VADStreamHandler(fastrtc.StreamHandler):
+    """FastRTC stream handler that replies once `RealtimeVAD` detects a finished utterance.
+
+    Incoming frames are passed to a `RealtimeVAD`. An interrupt event drops
+    the reply currently being emitted; a full-audio event starts a new reply
+    by calling `streamer_fn` with the captured utterance.
+    """
+    def __init__(
+        self,
+        streamer_fn: StreamerFn,
+        input_sample_rate: int = 24000,
+    ):
+        """Store the reply callback and create the VAD for `input_sample_rate`."""
+        # NOTE(review): positional arguments are presumably expected layout,
+        # output sample rate, output frame size, input sample rate and fps -
+        # confirm against the installed fastrtc version.
+        super().__init__(
+            "mono",
+            24000,
+            None,
+            input_sample_rate,
+            30,
+        )
+        self.streamer_fn = streamer_fn
+        self.realtime_vad = RealtimeVAD(src_sr=input_sample_rate)
+        # Reply currently being emitted; None while there is nothing to send.
+        self.generator: StreamerGenerator | None = None
+    @override
+    def emit(self) -> fastrtc.tracks.EmitType:
+        """Return the next item of the active reply, or None when there is none."""
+        if self.generator is None:
+            return None
+        try:
+            return next(self.generator)
+        except StopIteration:
+            # Reply finished: go back to idle until the next utterance.
+            self.generator = None
+            return None
+    @override
+    def receive(self, frame: tuple[int, np.ndarray]):
+        """Feed one incoming frame to the VAD and react to the events it yields."""
+        # The frame's sample rate is ignored; the VAD was built with
+        # `input_sample_rate`.
+        _, audio_data = frame
+        for event in self.realtime_vad.process(audio_data):
+            if event.interrupt_signal:
+                # The user started speaking: drop the reply in progress and
+                # call clear_queue().
+                self.generator = None
+                self.clear_queue()
+            if event.full_audio is not None:
+                self.wait_for_args_sync()
+                # Replace the first argument with the captured utterance before
+                # invoking the reply callback.
+                self.latest_args[0] = event.full_audio
+                self.generator = self.streamer_fn(*self.latest_args)
+    @override
+    def copy(self):
+        """Return a new handler with the same callback and input sample rate."""
+        return VADStreamHandler(
+            self.streamer_fn,
+            input_sample_rate=self.input_sample_rate,
+        )