# modified from https://github.com/XiaomiMiMo/MiMo-VL/tree/main/app.py
import gradio as gr
# `spaces` must be imported before anything that initialises CUDA so that
# ZeroGPU can patch torch correctly.
import spaces

from infer import MiMoVLInfer

# infer = MiMoVLInfer(checkpoint_path="XiaomiMiMo/MiMo-VL-7B-RL")
infer = MiMoVLInfer(checkpoint_path="XiaomiMiMo/MiMo-VL-7B-RL-2508")
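
# A minimal sketch of the `MiMoVLInfer` interface this app relies on, inferred
# from the call sites below (not the actual implementation in infer.py):
#
#   class MiMoVLInfer:
#       def __init__(self, checkpoint_path: str): ...
#       def to_device(self, device: str) -> None: ...  # move weights cuda/cpu
#       def __call__(self, inputs: dict, history: list, temperature: float):
#           # generator: yields (partial_response_text, updated_history)
#           ...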
label_translations = {
    "gr_chatinterface_ofl": {
        "English": "Chatbot",
    },
    "gr_chatinterface_ol": {
        "English": "Chatbot",
    },
    "gr_tab_ol": {
        "English": "Online",
    },
    "gr_tab_ofl": {
        "English": "Offline",
    },
    "gr_temperature": {
        "English": "Temperature",
    },
    "gr_webcam_image": {
        "English": "🤳 Open Webcam",
    },
    "gr_webcam_images": {
        "English": "📹 Recorded Frames",
    },
    "gr_chatinterface_ofl.textbox.placeholder": {
        "English": "Ask me anything. You can also drop in images and .mp4 videos.",
    },
    "gr_chatinterface_ol.textbox.placeholder": {
        "English": "Ask me anything...",
    },
}
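
# Lookup pattern: label_translations[<widget key>][<language>], e.g.
# label_translations['gr_temperature']['English'] -> 'Temperature'.
# Only English is wired up; adding a language means extending every inner dict
# and the gr_lang_selector choices below.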

# ZeroGPU: CUDA is only available inside functions wrapped with @spaces.GPU,
# so the model is moved onto the GPU on entry and back to CPU when done.
@spaces.GPU
def offline_chat(gr_inputs: dict, gr_history: list, infer_history: list, temperature: float):
    infer.to_device("cuda")
    try:
        yield [{"role": "assistant", "content": "⏳ Reserving GPU & preparing inference…"}], infer_history
        for response_text, infer_history in infer(inputs=gr_inputs,
                                                  history=infer_history,
                                                  temperature=temperature):
            if response_text.startswith('<think>') and '</think>' not in response_text:
                # Reasoning is still streaming: show it in a "Thinking" bubble.
                # str.removeprefix strips the literal '<think>' tag only.
                reasoning_text = response_text.removeprefix('<think>')
                response_message = [{
                    "role": "assistant",
                    "content": reasoning_text,
                    "metadata": {"title": "🤔 Thinking"},
                }]
                yield response_message, infer_history
            elif '<think>' in response_text and '</think>' in response_text:
                # Reasoning is complete: split it from the answer that follows.
                reasoning_text, answer_text = response_text.split('</think>', 1)
                reasoning_text = reasoning_text.removeprefix('<think>')
                response_message = [{
                    "role": "assistant",
                    "content": reasoning_text,
                    "metadata": {"title": "🤔 Thinking"},
                }, {
                    "role": "assistant",
                    "content": answer_text,
                }]
                yield response_message, infer_history
            else:
                yield [{"role": "assistant", "content": response_text}], infer_history
    finally:
        infer.to_device("cpu")

def online_record_chat(text: str, gr_history: list, gr_webcam_images: list, gr_counter: int,
                       infer_history: list, temperature: float):
    if not gr_webcam_images:
        gr_webcam_images = []
    # Forward only the frames captured since the previous turn (tracked by gr_counter).
    gr_webcam_images = gr_webcam_images[int(gr_counter):]
    # Gallery items are (filepath, caption) pairs; inference only needs the paths.
    inputs = {'text': text, 'files': [webp for webp, _ in gr_webcam_images]}
    # Send an immediate status chunk, then delegate to offline_chat, which
    # manages the GPU allocation and device placement itself.
    yield f'received {len(gr_webcam_images)} new frames, processing…', gr_counter + len(gr_webcam_images), infer_history
    for response_message, infer_history in offline_chat(inputs, gr_history, infer_history, temperature):
        yield response_message, gr.skip(), infer_history

with gr.Blocks() as demo:
    gr.Markdown("""<center><font size=8>MiMo-VL-7B</font></center>""")
    with gr.Column():
        with gr.Row():
            gr_lang_selector = gr.Dropdown(choices=["English"],
                                           value="English",
                                           label="🌐 Interface",
                                           interactive=True,
                                           min_width=250,
                                           scale=0)
        with gr.Tabs():
            with gr.Tab("Offline") as gr_tab_ofl:
                gr_infer_history = gr.State([])
                # Hidden mirror of the visible temperature slider below, so the
                # value can be passed through ChatInterface.additional_inputs.
                gr_temperature_hidden = gr.Slider(minimum=0.0,
                                                  maximum=2.0,
                                                  step=0.1,
                                                  value=0.4,
                                                  interactive=True,
                                                  visible=False)
                gr_chatinterface_ofl = gr.ChatInterface(
                    fn=offline_chat,
                    type="messages",
                    multimodal=True,
                    chatbot=gr.Chatbot(height=800),
                    textbox=gr.MultimodalTextbox(
                        file_count="multiple",
                        file_types=["image", ".mp4"],
                        sources=["upload"],
                        stop_btn=True,
                        placeholder=label_translations[
                            'gr_chatinterface_ofl.textbox.placeholder']['English'],
                    ),
                    additional_inputs=[gr_infer_history, gr_temperature_hidden],
                    additional_outputs=[gr_infer_history],
                )
                # Reset the model-side history whenever the chat UI is cleared.
                gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear],
                      fn=lambda: [],
                      outputs=[gr_infer_history])
                with gr.Row():
                    with gr.Column(scale=1, min_width=200):
                        gr_temperature_ofl = gr.Slider(
                            minimum=0.0,
                            maximum=2.0,
                            step=0.1,
                            value=0.4,
                            label=label_translations['gr_temperature']['English'],
                            interactive=True)
                        gr_temperature_ofl.change(lambda x: x,
                                                  inputs=gr_temperature_ofl,
                                                  outputs=gr_temperature_hidden)
                    with gr.Column(scale=8):
                        with gr.Column(visible=True) as gr_examples_en:
                            gr.Examples(
                                examples=[
                                    {
                                        "text": "Who are you?",
                                        "files": []
                                    },
                                ],
                                inputs=[gr_chatinterface_ofl.textbox],
                            )
| with gr.Tab("Online") as gr_tab_ol: | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| gr_infer_history = gr.State([]) | |
| gr_temperature_hidden = gr.Slider(minimum=0.0, | |
| maximum=2.0, | |
| step=0.1, | |
| value=1.0, | |
| interactive=True, | |
| visible=False) | |
| with gr.Row(): | |
| with gr.Column(scale=1): | |
| gr_webcam_image = gr.Image( | |
| label=label_translations['gr_webcam_image'] | |
| ['English'], | |
| sources="webcam", | |
| height=250, | |
| type='filepath') | |
| gr_webcam_images = gr.Gallery( | |
| label=label_translations['gr_webcam_images'] | |
| ['English'], | |
| show_label=True, | |
| format='webp', | |
| columns=1, | |
| height=250, | |
| preview=True, | |
| interactive=False) | |
| gr_counter = gr.Number(value=0, visible=False) | |
| with gr.Column(scale=3): | |
| gr_chatinterface_ol = gr.ChatInterface( | |
| fn=online_record_chat, | |
| type="messages", | |
| multimodal=False, | |
| chatbot=gr.Chatbot(height=800), | |
| textbox=gr. | |
| Textbox(placeholder=label_translations[ | |
| 'gr_chatinterface_ol.textbox.placeholder'] | |
| ['English'], | |
| submit_btn=True, | |
| stop_btn=True), | |
| additional_inputs=[ | |
| gr_webcam_images, gr_counter, | |
| gr_infer_history, gr_temperature_hidden | |
| ], | |
| additional_outputs=[ | |
| gr_counter, gr_infer_history | |
| ], | |
| ) | |
| def cache_webcam(recorded_image: str, | |
| recorded_images: list): | |
| if not recorded_images: | |
| recorded_images = [] | |
| return recorded_images + [recorded_image] | |
| gr_webcam_image.stream( | |
| fn=cache_webcam, | |
| inputs=[gr_webcam_image, gr_webcam_images], | |
| outputs=[gr_webcam_images], | |
| stream_every=1, | |
| concurrency_limit=30, | |
| ) | |
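                                # Design note: frames are cached into the gallery
                                # as the webcam streams (stream_every=1, i.e. about
                                # one frame per second) and are only sent to the
                                # model when the user submits a message;
                                # online_record_chat then slices off the frames
                                # already consumed in earlier turns.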
                        with gr.Row():
                            gr_temperature_ol = gr.Slider(
                                minimum=0.0,
                                maximum=2.0,
                                step=0.1,
                                value=0.4,
                                label=label_translations['gr_temperature']['English'],
                                interactive=True)
                            gr_temperature_ol.change(
                                lambda x: x,
                                inputs=gr_temperature_ol,
                                outputs=gr_temperature_hidden)

    def update_lang(lang: str):
        # One gr.update per component, in the same order as the outputs list
        # of gr_lang_selector.change below.
        return (
            gr.update(label=label_translations['gr_chatinterface_ofl'][lang]),
            gr.update(label=label_translations['gr_chatinterface_ol'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ofl.textbox.placeholder'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ol.textbox.placeholder'][lang]),
            gr.update(label=label_translations['gr_tab_ofl'][lang]),
            gr.update(label=label_translations['gr_tab_ol'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(visible=lang == 'English'),
            gr.update(label=label_translations['gr_webcam_image'][lang]),
            gr.update(label=label_translations['gr_webcam_images'][lang]),
        )

    gr_lang_selector.change(fn=update_lang,
                            inputs=[gr_lang_selector],
                            outputs=[
                                gr_chatinterface_ofl.chatbot,
                                gr_chatinterface_ol.chatbot,
                                gr_chatinterface_ofl.textbox,
                                gr_chatinterface_ol.textbox,
                                gr_tab_ofl,
                                gr_tab_ol,
                                gr_temperature_ofl,
                                gr_temperature_ol,
                                gr_examples_en,
                                gr_webcam_image,
                                gr_webcam_images,
                            ])

demo.queue(default_concurrency_limit=2, max_size=50)

if __name__ == "__main__":
    demo.launch()
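
# Local usage (a sketch; assumes infer.py and its model dependencies are present):
#   pip install gradio spaces
#   python app.py
# On Hugging Face ZeroGPU the runtime provides the GPU; locally you need a CUDA
# device for infer.to_device("cuda") to succeed.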