Spaces:

AIDC-AI
/

Ovis2.5-9B

Running on Zero

App Files Files Community

玙珲 committed on Aug 15

Commit

939e0e4

1 Parent(s): eb0a0f3

support multi-turn, video

Browse files

Files changed (3) hide show

.gitattributes +1 -0
app.py +183 -90
examples/video_demo.mp4 +3 -0

.gitattributes CHANGED Viewed

@@ -35,3 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.jpg filter=lfs diff=lfs merge=lfs -text
 *.png filter=lfs diff=lfs merge=lfs -text

 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.jpg filter=lfs diff=lfs merge=lfs -text
 *.png filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -3,25 +3,74 @@ subprocess.run('pip install flash-attn==2.7.0.post2 --no-build-isolation', env={
 import spaces
 import argparse
 import os
 import re
-from typing import List, Optional, Tuple
 import gradio as gr
 import PIL.Image
 import torch
 import numpy as np
 from moviepy.editor import VideoFileClip
-from transformers import AutoModelForCausalLM
-# --- Global Model Variable ---
-# model = None
 # This should point to the directory containing your SVG file.
 CUR_DIR = os.path.dirname(os.path.abspath(__file__))
 # --- Helper Functions ---
 def load_video_frames(video_path: Optional[str], n_frames: int = 8) -> Optional[List[PIL.Image.Image]]:
     """Extracts a specified number of frames from a video file."""
@@ -42,44 +91,62 @@ def load_video_frames(video_path: Optional[str], n_frames: int = 8) -> Optional[
 def parse_model_output(response_text: str, enable_thinking: bool) -> str:
     """Formats the model output, separating 'thinking' and 'response' parts if enabled."""
     if enable_thinking:
         think_match = re.search(r"<think>(.*?)</think>", response_text, re.DOTALL)
         if think_match:
             thinking_content = think_match.group(1).strip()
             response_content = re.sub(r"<think>.*?</think>", "", response_text, flags=re.DOTALL).strip()
             return f"**Thinking:**\n```\n{thinking_content}\n```\n\n**Response:**\n{response_content}"
         else:
-            return response_text
     else:
-        return response_text
-# --- Core Inference Logic ---
-@spaces.GPU
 def run_inference(
     image_input: Optional[PIL.Image.Image],
     video_input: Optional[str],
-    prompt: str,
     do_sample: bool,
     max_new_tokens: int,
     enable_thinking: bool,
-) -> List[List[str]]:
-    """Runs a single turn of inference and formats the output for a gr.Chatbot."""
     if (not image_input and not video_input and not prompt) or not prompt:
         gr.Warning("A text prompt is required for generation.")
-        return []
     content = []
     if image_input:
         content.append({"type": "image", "image": image_input})
     if video_input:
         frames = load_video_frames(video_input)
-        if frames: content.append({"type": "video", "video": frames})
         else:
             gr.Warning("Failed to process the video file.")
-            return [[prompt, "Error: Could not process the video file."]]
     content.append({"type": "text", "text": prompt})
     messages = [{"role": "user", "content": content}]
     try:
         if video_input:
@@ -87,7 +154,9 @@ def run_inference(
         else:
             input_ids, pixel_values, grid_thws = model.preprocess_inputs(messages=messages, add_generation_prompt=True, enable_thinking=enable_thinking)
     except Exception as e:
-        return [[prompt, f"Error during input preprocessing: {e}"]]
     input_ids = input_ids.to(model.device)
     if pixel_values is not None:
@@ -96,48 +165,90 @@ def run_inference(
         grid_thws = grid_thws.to(model.device)
     gen_kwargs = {
-        "max_new_tokens": max_new_tokens, "do_sample": do_sample,
-        "eos_token_id": model.text_tokenizer.eos_token_id, "pad_token_id": model.text_tokenizer.pad_token_id
     }
-    with torch.inference_mode():
-        try:
-            outputs = model.generate(inputs=input_ids, pixel_values=pixel_values, grid_thws=grid_thws, **gen_kwargs)
-        except Exception as e:
-            return [[prompt, f"Error during model generation: {e}"]]
-    response_text = model.text_tokenizer.decode(outputs[0], skip_special_tokens=True)
-    formatted_response = parse_model_output(response_text, enable_thinking)
-    return [[prompt, formatted_response]]
 # --- UI Helper Functions ---
 def toggle_media_input(choice: str) -> Tuple:
     """Switches visibility between Image/Video inputs and their corresponding examples."""
     if choice == "Image":
         return gr.update(visible=True, value=None), gr.update(visible=False, value=None), gr.update(visible=True), gr.update(visible=False)
-    else: # Video
         return gr.update(visible=False, value=None), gr.update(visible=True, value=None), gr.update(visible=False), gr.update(visible=True)
 # --- Build Gradio Application ---
 # @spaces.GPU
 def build_demo(model_path: str):
     """Builds the Gradio user interface for the model."""
-    global model
-    device = f"cuda"
     print(f"Loading model {model_path} onto device {device}...")
     model = AutoModelForCausalLM.from_pretrained(
-        model_path, torch_dtype=torch.bfloat16, trust_remote_code=True
     ).to(device).eval()
     print("Model loaded successfully.")
     model_name_display = model_path.split('/')[-1]
-    # --- Logo & Header ---
     logo_html = ""
     logo_svg_path = os.path.join(CUR_DIR, "resource", "logo.svg")
     if os.path.exists(logo_svg_path):
@@ -147,7 +258,6 @@ def build_demo(model_path: str):
         svg_content_styled = re.sub(r'(<svg[^>]*)(>)', rf'\1 height="{font_size}" style="vertical-align: middle; display: inline-block;"\2', svg_content)
         logo_html = f'<span style="display: inline-block; vertical-align: middle;">{svg_content_styled}</span>'
     else:
-        # Fallback if SVG is not found
         logo_html = '<span style="font-weight: bold; font-size: 2.5em; display: inline-block; vertical-align: middle;">Ovis</span>'
         print(f"Warning: Logo file not found at {logo_svg_path}. Using text fallback.")
@@ -159,26 +269,23 @@ def build_demo(model_path: str):
     <center><font size=3><b>Ovis</b> has been open-sourced on <a href='https://huggingface.co/{model_path}'>😊 Huggingface</a> and <a href='https://github.com/AIDC-AI/Ovis'>🌟 GitHub</a>. If you find Ovis useful, a like❤️ or a star🌟 would be appreciated.</font></center>
     """
     with gr.Blocks(theme=gr.themes.Ocean()) as demo:
         gr.HTML(html_header)
-        gr.Markdown(f"This interface is served by a single model. Each submission starts a new, independent conversation.")
         with gr.Row():
-            # --- Left Column (Media Inputs, Settings, Prompt & Actions) ---
             with gr.Column(scale=4):
-                input_type_radio = gr.Radio(choices=["Image"], value="Image", label="Select Input Type")
                 image_input = gr.Image(label="Image Input", type="pil", visible=True)
                 video_input = gr.Video(label="Video Input", visible=False)
                 with gr.Accordion("Generation Settings", open=True):
-                    do_sample = gr.Checkbox(label="Enable Sampling (Do Sample)", value=False)
                     max_new_tokens = gr.Slider(minimum=32, maximum=4096, value=1024, step=32, label="Max New Tokens")
-                    enable_thinking = gr.Checkbox(label="Enable Deep Thinking", value=True)
-                prompt_input = gr.Textbox(label="Prompt", placeholder="Enter your text here and press ENTER", lines=3)
-                with gr.Row():
-                    generate_btn = gr.Button("Send", variant="primary")
-                    clear_btn = gr.Button("Clear", variant="secondary")
                 with gr.Column(visible=True) as image_examples_col:
                     gr.Examples(
@@ -191,34 +298,39 @@ def build_demo(model_path: str):
                         ],
                         inputs=[image_input, prompt_input]
                     )
-                # with gr.Column(visible=False) as video_examples_col:
-                #     gr.Examples(examples=[[os.path.join(CUR_DIR, "examples", "video_demo_1.mp4"), "Describe the video."]],
-                #         inputs=[video_input, prompt_input])
-            # --- Right Column (Chat Display) ---
-            with gr.Column(scale=6):
-                chatbot = gr.Chatbot(label="Ovis", height=750, show_copy_button=True, layout="panel")
-        # --- Event Handlers ---
         input_type_radio.change(
             fn=toggle_media_input,
             inputs=input_type_radio,
-            outputs=[image_input, video_input, image_examples_col]
         )
-        run_inputs = [image_input, video_input, prompt_input, do_sample, max_new_tokens, enable_thinking]
-        generate_btn.click(fn=run_inference, inputs=run_inputs, outputs=chatbot)
-        prompt_input.submit(fn=run_inference, inputs=run_inputs, outputs=chatbot)
         clear_btn.click(
-            fn=lambda: ([], None, None, "", "Image", False, 1024, True),
             outputs=[chatbot, image_input, video_input, prompt_input, input_type_radio, do_sample, max_new_tokens, enable_thinking]
         ).then(
              fn=toggle_media_input,
              inputs=input_type_radio,
-             outputs=[image_input, video_input, image_examples_col]
         )
     return demo
 # --- Main Execution Block ---
@@ -230,30 +342,11 @@ def build_demo(model_path: str):
 #     parser.add_argument("--server-name", type=str, default="0.0.0.0", help="Server name for the Gradio app.")
 #     return parser.parse_args()
-# if __name__ == "__main__":
-#     if not os.path.exists("examples"): os.makedirs("examples")
-#     if not os.path.exists("resource"): os.makedirs("resource")
-#     print("Note: For the logo to display correctly, place 'logo.svg' inside the 'resource' directory.")
-    # example_files = [
-    #     "ovis2_math0.jpg",
-    #     "ovis2_math1.jpg",
-    #     "ovis2_figure0.png",
-    #     "ovis2_figure1.png",
-    #     "ovis2_multi0.jpg",
-    #     "video_demo_1.mp4",
-    # ]
-    # for fname in example_files:
-    #     fpath = os.path.join("examples", fname)
-    #     if not os.path.exists(fpath):
-    #         if fname.endswith(".mp4"):
-    #             os.system(f'ffmpeg -y -f lavfi -i "smptebars=size=128x72:rate=10" -t 3 -pix_fmt yuv420p "{fpath}" >/dev/null 2>&1')
-    #         else:
-    #             PIL.Image.new('RGB', (224, 224), color = 'grey').save(fpath)
 model_path = 'AIDC-AI/Ovis2.5-9B'
 demo = build_demo(model_path=model_path)
-# print(f"Launching Gradio app on http://{args.server_name}:{args.port}")
-# demo.queue().launch(server_name=args.server_name, server_port=args.port, share=False, ssl_verify=False)
-demo.launch()

 import spaces
 import argparse
 import os
 import re
+import logging
+from typing import List, Optional, Tuple, Generator
+from threading import Thread
 import gradio as gr
 import PIL.Image
 import torch
 import numpy as np
 from moviepy.editor import VideoFileClip
+from transformers import AutoModelForCausalLM, TextIteratorStreamer
+logging.getLogger("httpx").setLevel(logging.WARNING)
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# --- Global Model Variables ---
+model = None
+streamer = None
 # This should point to the directory containing your SVG file.
 CUR_DIR = os.path.dirname(os.path.abspath(__file__))
def submit_chat(chatbot, text_input):
    """Open a new chat turn for the submitted prompt.

    Appends ``[text_input, '']`` to ``chatbot`` in place, leaving the reply
    slot empty so it can be filled in afterwards.

    Returns:
        A ``(chatbot, '')`` tuple: the updated history and an empty string
        (the event wiring sends the second value to the prompt textbox).
    """
    pending_turn = [text_input, '']
    chatbot.append(pending_turn)
    return chatbot, ''
 # --- Helper Functions ---
# LaTeX delimiter pairs handed to the chat display (``latex_delimiters=``).
# Each entry is (left delimiter, right delimiter, display flag); the first
# pair is the only one with ``display`` set to False.
latex_delimiters_set = [
    {"left": opener, "right": closer, "display": is_block}
    for opener, closer, is_block in (
        ("\\(", "\\)", False),
        ("\\begin{equation}", "\\end{equation}", True),
        ("\\begin{align}", "\\end{align}", True),
        ("\\begin{alignat}", "\\end{alignat}", True),
        ("\\begin{gather}", "\\end{gather}", True),
        ("\\begin{CD}", "\\end{CD}", True),
        ("\\[", "\\]", True),
    )
]
 def load_video_frames(video_path: Optional[str], n_frames: int = 8) -> Optional[List[PIL.Image.Image]]:
     """Extracts a specified number of frames from a video file."""
 def parse_model_output(response_text: str, enable_thinking: bool) -> str:
     """Format raw model output, separating the 'thinking' part from the response.

     Args:
         response_text: Decoded text produced by the model.
         enable_thinking: Whether thinking mode was requested for this turn.

     Returns:
         With thinking enabled and a ``<think>`` block present: a Markdown
         string with the thinking text in a fenced block under
         ``**Thinking:**`` followed by the remaining text under
         ``**Response:**``. With thinking enabled and no ``<think>`` tag:
         ``response_text`` unchanged. With thinking disabled: the text with
         any complete ``<think>...</think>`` blocks removed and stripped.
     """
     # Non-greedy match of one complete block. This does not handle nested
     # <think> tags: the first closing tag ends the block.
     closed_think = r"<think>.*?</think>"

     if not enable_thinking:
         # Thinking is off, but strip the tags in case the model emits them anyway.
         return re.sub(closed_think, "", response_text, flags=re.DOTALL).strip()

     think_match = re.search(r"<think>(.*?)</think>", response_text, re.DOTALL)
     if think_match:
         thinking_content = think_match.group(1).strip()
         # Everything outside the think block(s) is the user-facing response.
         response_content = re.sub(closed_think, "", response_text, flags=re.DOTALL).strip()
     else:
         before, opener, after = response_text.partition("<think>")
         if not opener:
             return response_text  # No think tag found, return as is
         # The block was opened but never closed (e.g. generation stopped at
         # the token limit): treat the tail as thinking rather than leaking
         # the raw tag into the chat.
         thinking_content = after.strip()
         response_content = before.strip()

     return f"**Thinking:**\n```\n{thinking_content}\n```\n\n**Response:**\n{response_content}"
+# --- MODIFIED Core Inference Logic (Now with Streaming) ---
+# @spaces.GPU
 def run_inference(
+    chatbot: List,
     image_input: Optional[PIL.Image.Image],
     video_input: Optional[str],
     do_sample: bool,
     max_new_tokens: int,
     enable_thinking: bool,
+):
+    """
+    Runs a single turn of inference and yields the output stream for a gr.Chatbot.
+    This function is now a generator.
+    """
+    prompt = chatbot[-1][0]
     if (not image_input and not video_input and not prompt) or not prompt:
         gr.Warning("A text prompt is required for generation.")
+        # MODIFICATION: Yield the current state and return to avoid errors
+        yield chatbot
+        return
+    # MODIFICATION: Append the new prompt to the existing history
+    # chatbot.append([prompt, ""])
+    # yield chatbot, "" # Yield the updated chat to show the user's prompt immediately
     content = []
     if image_input:
         content.append({"type": "image", "image": image_input})
     if video_input:
         frames = load_video_frames(video_input)
+        if frames:
+            content.append({"type": "video", "video": frames})
         else:
             gr.Warning("Failed to process the video file.")
+            chatbot[-1][1] = "Error: Could not process the video file."
+            yield chatbot
+            return
     content.append({"type": "text", "text": prompt})
     messages = [{"role": "user", "content": content}]
+    logger.info(messages)
     try:
         if video_input:
         else:
             input_ids, pixel_values, grid_thws = model.preprocess_inputs(messages=messages, add_generation_prompt=True, enable_thinking=enable_thinking)
     except Exception as e:
+        chatbot[-1][1] = f"Error during input preprocessing: {e}"
+        yield chatbot
+        return
     input_ids = input_ids.to(model.device)
     if pixel_values is not None:
         grid_thws = grid_thws.to(model.device)
     gen_kwargs = {
+        "max_new_tokens": max_new_tokens,
+        "do_sample": do_sample,
+        "eos_token_id": model.text_tokenizer.eos_token_id,
+        "pad_token_id": model.text_tokenizer.pad_token_id,
+        "streamer": streamer,
+        "use_cache": True
     }
+    with torch.inference_mode():
+        thread = Thread(target=model.generate, kwargs={
+            "inputs": input_ids,
+            "pixel_values": pixel_values,
+            "grid_thws": grid_thws,
+            **gen_kwargs
+        })
+        thread.start()
+        # MODIFICATION: Stream output token by token
+        response_text = ""
+        for new_text in streamer:
+            response_text += new_text
+            # Append only the new text chunk to the last response
+            chatbot[-1][1] = response_text
+            yield chatbot # Yield the updated history
+        thread.join()
+        # MODIFICATION: Format the final response once generation is complete
+        formatted_response = parse_model_output(response_text, enable_thinking)
+        chatbot[-1][1] = formatted_response
+        yield chatbot # Yield the final, formatted response
+        logger.info("[OVIS_CONV_START]")
+        [print(f'Q{i}:\n {request}\nA{i}:\n {answer}') for i, (request, answer) in enumerate(chatbot, 1)]
+        # print('New_Q:\n', text_input)
+        # print('New_A:\n', response)
+        logger.info("[OVIS_CONV_END]")
def clear_chat():
    """Return reset values: an empty list, ``None`` and an empty string.

    NOTE(review): no caller is visible in this excerpt; the Clear button is
    wired to its own lambda. Confirm whether this helper is still needed.
    """
    cleared = ([], None, "")
    return cleared
 # --- UI Helper Functions ---
 def toggle_media_input(choice: str) -> Tuple:
     """Show the inputs and examples for the selected media type, hide the other.

     Args:
         choice: The selected input type; ``"Image"`` selects the image
             widgets, any other value selects the video widgets.

     Returns:
         Four ``gr.update`` objects, in order: image input, video input,
         image examples column, video examples column. Both input updates
         also reset the widget value to ``None``.
     """
     show_image = choice == "Image"
     show_video = not show_image
     return (
         gr.update(visible=show_image, value=None),
         gr.update(visible=show_video, value=None),
         gr.update(visible=show_image),
         gr.update(visible=show_video),
     )
+# # --- MODIFIED: New function to handle chat state and input clearing ---
+# def process_and_clear(chatbot: List, image_input: PIL.Image.Image, video_input: str, prompt: str, do_sample: bool, max_new_tokens: int, enable_thinking: bool):
+#     """
+#     This function now takes the chatbot state as input to maintain conversation history
+#     and clears the prompt box after submission.
+#     """
+#     # Create a generator by calling the main run_inference function
+#     generator = run_inference(chatbot, image_input, video_input, prompt, do_sample, max_new_tokens, enable_thinking)
+#     # Yield from the generator
+#     for chatbot_state, _ in generator:
+#         yield chatbot_state, "" # Clear prompt after first yield
 # --- Build Gradio Application ---
 # @spaces.GPU
 def build_demo(model_path: str):
     """Builds the Gradio user interface for the model."""
+    global model, streamer
+    device = "cuda"
     print(f"Loading model {model_path} onto device {device}...")
     model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True
     ).to(device).eval()
+    text_tokenizer = model.text_tokenizer
+    streamer = TextIteratorStreamer(text_tokenizer, skip_prompt=True, skip_special_tokens=True)
     print("Model loaded successfully.")
     model_name_display = model_path.split('/')[-1]
     logo_html = ""
     logo_svg_path = os.path.join(CUR_DIR, "resource", "logo.svg")
     if os.path.exists(logo_svg_path):
         svg_content_styled = re.sub(r'(<svg[^>]*)(>)', rf'\1 height="{font_size}" style="vertical-align: middle; display: inline-block;"\2', svg_content)
         logo_html = f'<span style="display: inline-block; vertical-align: middle;">{svg_content_styled}</span>'
     else:
         logo_html = '<span style="font-weight: bold; font-size: 2.5em; display: inline-block; vertical-align: middle;">Ovis</span>'
         print(f"Warning: Logo file not found at {logo_svg_path}. Using text fallback.")
     <center><font size=3><b>Ovis</b> has been open-sourced on <a href='https://huggingface.co/{model_path}'>😊 Huggingface</a> and <a href='https://github.com/AIDC-AI/Ovis'>🌟 GitHub</a>. If you find Ovis useful, a like❤️ or a star🌟 would be appreciated.</font></center>
     """
+    prompt_input = gr.Textbox(label="Prompt", placeholder="Enter your text here and press ENTER", lines=3, container=False)
     with gr.Blocks(theme=gr.themes.Ocean()) as demo:
         gr.HTML(html_header)
+        gr.Markdown("Note: you might have to increase the \"Max New Tokens\" and wait longer to obtain answer when Deep Thinking is enabled.")
         with gr.Row():
             with gr.Column(scale=4):
+                input_type_radio = gr.Radio(choices=["Image", "Video"], value="Image", label="Select Input Type")
                 image_input = gr.Image(label="Image Input", type="pil", visible=True)
                 video_input = gr.Video(label="Video Input", visible=False)
                 with gr.Accordion("Generation Settings", open=True):
+                    do_sample = gr.Checkbox(label="Enable Sampling (Do Sample)", value=True)
                     max_new_tokens = gr.Slider(minimum=32, maximum=4096, value=1024, step=32, label="Max New Tokens")
+                    enable_thinking = gr.Checkbox(label="Enable Deep Thinking", value=False)
                 with gr.Column(visible=True) as image_examples_col:
                     gr.Examples(
                         ],
                         inputs=[image_input, prompt_input]
                     )
+                with gr.Column(visible=False) as video_examples_col:
+                     gr.Examples(examples=[[os.path.join(CUR_DIR, "examples", "video_demo.mp4"), "Describe the video."]],
+                         inputs=[video_input, prompt_input])
+            with gr.Column(scale=7):
+                chatbot = gr.Chatbot(label="Ovis", height=750, show_copy_button=True, layout="panel", latex_delimiters=latex_delimiters_set)
+                prompt_input.render()
+                with gr.Row():
+                    generate_btn = gr.Button("Send", variant="primary")
+                    clear_btn = gr.Button("Clear", variant="secondary")
         input_type_radio.change(
             fn=toggle_media_input,
             inputs=input_type_radio,
+            outputs=[image_input, video_input, image_examples_col, video_examples_col]
         )
+        # MODIFICATION: Update event handlers to use the new function and manage state
+        run_inputs = [chatbot, image_input, video_input, do_sample, max_new_tokens, enable_thinking]
+        # run_outputs = [image_input, prompt_input]
+        generat_click_event = generate_btn.click(submit_chat, [chatbot, prompt_input], [chatbot, prompt_input]).then(run_inference, run_inputs, chatbot)
+        submit_event = prompt_input.submit(submit_chat, [chatbot, prompt_input], [chatbot, prompt_input]).then(run_inference, run_inputs, chatbot)
         clear_btn.click(
+            fn=lambda: ([], None, None, "", "Image", True, 1024, False),
             outputs=[chatbot, image_input, video_input, prompt_input, input_type_radio, do_sample, max_new_tokens, enable_thinking]
         ).then(
              fn=toggle_media_input,
              inputs=input_type_radio,
+             outputs=[image_input, video_input, image_examples_col, video_examples_col]
         )
     return demo
 # --- Main Execution Block ---
 #     parser.add_argument("--server-name", type=str, default="0.0.0.0", help="Server name for the Gradio app.")
 #     return parser.parse_args()
+# if __name__ == "__main__":
+#     args = parse_args()
 model_path = 'AIDC-AI/Ovis2.5-9B'
 demo = build_demo(model_path=model_path)
+# demo = build_demo(model_path=args.model_path)
+# demo.launch(server_name=args.server_name, server_port=args.port, share=False, ssl_verify=False, show_error=True)
+demo.queue().launch()

examples/video_demo.mp4 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e4476e4fd82da4fc37b4c167ec6a4f56fa270c0ad3f2724fd47c0ff92b87d6c6
+size 103118