prithivMLmods committed on
Commit 2acc319 · verified · 1 Parent(s): d481cbf

Update app.py

Files changed (1)
  1. app.py +319 -331
app.py CHANGED
@@ -1,10 +1,9 @@
  import os
  import random
  import uuid
- import spaces
  import time
- import base64
- from http import HTTPStatus
  from threading import Thread

  import gradio as gr
@@ -15,369 +14,358 @@ from PIL import Image, ImageOps
  import cv2

  from transformers import (
      Qwen2_5_VLForConditionalGeneration,
      AutoModelForVision2Seq,
      AutoProcessor,
      TextIteratorStreamer,
  )
- from gradio_client import utils as client_utils
- import modelscope_studio.components.antd as antd
- import modelscope_studio.components.antdx as antdx
- import modelscope_studio.components.base as ms
- import modelscope_studio.components.pro as pro

- # --- Constants and Configuration ---
  MAX_MAX_NEW_TOKENS = 5120
  DEFAULT_MAX_NEW_TOKENS = 3072
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  # --- Model Loading ---
- # A dictionary to hold our models and processors for easy access
- models = {}
- processors = {}
- MODEL_CHOICES = [
-     "Nanonets-OCR-s",
-     "MonkeyOCR-Recognition",
-     "Thyme-RL",
-     "Typhoon-OCR-7B",
-     "SmolDocling-256M-preview"
- ]
-
- def load_model(model_id, processor_class, model_class, subfolder=None, model_key=''):
-     """Helper function to load a model and processor."""
-     print(f"Loading model: {model_key}...")
-     try:
-         processor_args = {"trust_remote_code": True}
-         model_args = {"trust_remote_code": True, "torch_dtype": torch.float16}
-
-         if subfolder:
-             processor_args["subfolder"] = subfolder
-             model_args["subfolder"] = subfolder
-
-         processors[model_key] = processor_class.from_pretrained(model_id, **processor_args)
-         models[model_key] = model_class.from_pretrained(model_id, **model_args).to(device).eval()
-         print(f"Successfully loaded {model_key}.")
-     except Exception as e:
-         print(f"Error loading model {model_key}: {e}")
-         # If a model fails to load, remove it from the choices
-         if model_key in MODEL_CHOICES:
-             MODEL_CHOICES.remove(model_key)
-
- # Load all models
- load_model("nanonets/Nanonets-OCR-s", AutoProcessor, Qwen2_5_VLForConditionalGeneration, model_key="Nanonets-OCR-s")
- load_model("echo80/MonkeyOCR", AutoProcessor, Qwen2_5_VLForConditionalGeneration, subfolder="Recognition", model_key="MonkeyOCR-Recognition")
- load_model("scb10x/typhoon-ocr-7b", AutoProcessor, Qwen2_5_VLForConditionalGeneration, model_key="Typhoon-OCR-7B")
- load_model("ds4sd/SmolDocling-256M-preview", AutoProcessor, AutoModelForVision2Seq, model_key="SmolDocling-256M-preview")
- load_model("Kwai-Keye/Thyme-RL", AutoProcessor, Qwen2_5_VLForConditionalGeneration, model_key="Thyme-RL")


  # --- Preprocessing and Helper Functions ---
  def add_random_padding(image, min_percent=0.1, max_percent=0.10):
-     """Add random padding to an image."""
      image = image.convert("RGB")
      width, height = image.size
-     pad_w = int(width * random.uniform(min_percent, max_percent))
-     pad_h = int(height * random.uniform(min_percent, max_percent))
-     padded_image = ImageOps.expand(image, border=(pad_w, pad_h, pad_w, pad_h), fill=image.getpixel((0, 0)))
      return padded_image

- def downsample_video(video_path, num_frames=10):
-     """Downsample a video into a list of PIL Image frames."""
-     if not os.path.exists(video_path): return []
      vidcap = cv2.VideoCapture(video_path)
      total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
      frames = []
-     if total_frames > 0:
-         frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
-         for i in frame_indices:
-             vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
-             success, image = vidcap.read()
-             if success:
-                 frames.append(Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)))
      vidcap.release()
      return frames

- def format_history_for_model(history, selected_model):
-     """Prepares history for the multimodal model, handling text and media files."""
-     last_user_message = next((item for item in reversed(history) if item["role"] == "user"), None)
-     if not last_user_message:
-         return None, [], ""
-
-     text = ""
-     files = []
-     images = []
-
-     for content_part in last_user_message["content"]:
-         if content_part["type"] == "text":
-             text = content_part["content"]
-         elif content_part["type"] == "file":
-             files.extend(content_part["content"])
-
-     for file_path in files:
-         mime_type = client_utils.get_mimetype(file_path)
-         if mime_type.startswith("image"):
-             images.append(Image.open(file_path))
-         elif mime_type.startswith("video"):
-             images.extend(downsample_video(file_path))
-
-     # Apply model-specific preprocessing
-     if selected_model == "SmolDocling-256M-preview":
-         if "OTSL" in text or "code" in text:
-             images = [add_random_padding(img) for img in images]
-
-     return text, images, selected_model

  @spaces.GPU
- # --- Gradio Events and Application Logic ---
- class Gradio_Events:
-
-     @staticmethod
-     def submit(state_value):
-         conv_id = state_value["conversation_id"]
-         context = state_value["conversation_contexts"][conv_id]
-         history = context["history"]
-         model_name = context.get("selected_model", MODEL_CHOICES[0])
-
-         processor = processors.get(model_name)
-         model = models.get(model_name)
-
-         if not processor or not model:
-             history.append({"role": "assistant", "content": [{"type": "text", "content": f"Error: Model '{model_name}' not loaded."}]})
-             yield {chatbot: gr.update(value=history), state: gr.update(value=state_value)}
-             return
-
-         text, images, _ = format_history_for_model(history, model_name)
-
-         if not text and not images:
-             yield {chatbot: gr.update(value=history), state: gr.update(value=state_value)}
-             return
-
-         history.append({
-             "role": "assistant",
-             "content": [],
-             "key": str(uuid.uuid4()),
-             "loading": True,
-         })
-         yield {chatbot: gr.update(value=history), state: gr.update(value=state_value)}
-
-         try:
-             messages = [{"role": "user", "content": []}]
-             if images:
-                 messages[0]["content"].extend([{"type": "image"}] * len(images))
-             messages[0]["content"].append({"type": "text", "text": text or "Describe the media."})
-
-             prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-             inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-
-             streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-             generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": MAX_MAX_NEW_TOKENS}
-
-             thread = Thread(target=model.generate, kwargs=generation_kwargs)
-             thread.start()
-
-             buffer = ""
-             for new_text in streamer:
-                 buffer += new_text.replace("<|im_end|>", "")
-                 history[-1]["content"] = [{"type": "text", "content": buffer}]
-                 history[-1]["loading"] = True
-                 yield {chatbot: gr.update(value=history), state: gr.update(value=state_value)}
-
-             history[-1]["loading"] = False
-             # Final post-processing, especially for models like SmolDocling
-             final_content = buffer.strip().replace("<end_of_utterance>", "")
-             history[-1]["content"] = [{"type": "text", "content": final_content}]
-
-             yield {chatbot: gr.update(value=history), state: gr.update(value=state_value)}
-
-         except Exception as e:
-             print(f"Error during model generation: {e}")
-             history[-1]["loading"] = False
-             history[-1]["content"] = [{"type": "text", "content": f'<span style="color: red;">An error occurred: {e}</span>'}]
-             yield {chatbot: gr.update(value=history), state: gr.update(value=state_value)}
-
-     @staticmethod
-     def add_message(input_value, state_value):
-         text = input_value["text"]
-         files = input_value["files"]
-
-         if not state_value["conversation_id"]:
-             random_id = str(uuid.uuid4())
-             state_value["conversation_id"] = random_id
-             state_value["conversations"].append({"label": text or "New Chat", "key": random_id})
-             state_value["conversation_contexts"][random_id] = {
-                 "history": [],
-                 "selected_model": MODEL_CHOICES[0]  # Default model
-             }
-
-         conv_id = state_value["conversation_id"]
-         history = state_value["conversation_contexts"][conv_id]["history"]
-         history.append({
-             "key": str(uuid.uuid4()),
-             "role": "user",
-             "content": [{"type": "file", "content": files}, {"type": "text", "content": text}]
-         })
-
-         yield Gradio_Events.preprocess_submit(clear_input=True)(state_value)
-         for chunk in Gradio_Events.submit(state_value):
-             yield chunk
-         yield Gradio_Events.postprocess_submit(state_value)
-
-     @staticmethod
-     def preprocess_submit(clear_input=True):
-         def handler(state_value):
-             conv_id = state_value["conversation_id"]
-             history = state_value["conversation_contexts"][conv_id]["history"]
-             return {
-                 input_comp: gr.update(value={'text': '', 'files': []} if clear_input else {}, loading=True),
-                 conversations: gr.update(active_key=conv_id, items=state_value["conversations"]),
-                 add_conversation_btn: gr.update(disabled=True),
-                 chatbot: gr.update(value=history),
-                 state: gr.update(value=state_value),
-             }
-         return handler
-
-     @staticmethod
-     def postprocess_submit(state_value):
-         conv_id = state_value["conversation_id"]
-         history = state_value["conversation_contexts"][conv_id]["history"]
-         return {
-             input_comp: gr.update(loading=False),
-             add_conversation_btn: gr.update(disabled=False),
-             chatbot: gr.update(value=history),
-             state: gr.update(value=state_value),
-         }
-
-     @staticmethod
-     def apply_prompt(e: gr.EventData):
-         # Example format: {"description": "Query text", "urls": ["path/to/image.png"]}
-         prompt_data = e._data["payload"][0]["value"]
-         return gr.update(value={'text': prompt_data['description'], 'files': prompt_data['urls']})
-
-     @staticmethod
-     def new_chat(state_value):
-         state_value["conversation_id"] = ""
-         return gr.update(active_key=""), gr.update(value=None), gr.update(value=state_value), gr.update(value=MODEL_CHOICES[0])
-
-     @staticmethod
-     def select_conversation(state_value, e: gr.EventData):
-         active_key = e._data["payload"][0]
-         if state_value["conversation_id"] == active_key or active_key not in state_value["conversation_contexts"]:
-             return gr.skip()
-         state_value["conversation_id"] = active_key
-         context = state_value["conversation_contexts"][active_key]
-         return gr.update(active_key=active_key), gr.update(value=context["history"]), gr.update(value=state_value), gr.update(value=context.get("selected_model", MODEL_CHOICES[0]))
-
-     @staticmethod
-     def on_model_change(model_name, state_value):
-         if state_value["conversation_id"]:
-             state_value["conversation_contexts"][state_value["conversation_id"]]["selected_model"] = model_name
-         return state_value


- # --- UI Layout and Components ---
  css = """
- .gradio-container { padding: 0 !important; }
- main.fillable { padding: 0 !important; }
- #chatbot_container { height: calc(100vh - 80px); max-height: 1000px; }
- #conversations_sidebar .chatbot-conversations {
-     height: 100vh; background-color: var(--ms-gr-ant-color-bg-layout); padding: 8px;
- }
- #main_chat_area { padding: 16px; height: 100%; }
  """

- # Define welcome prompts based on available examples
- welcome_prompts = [
-     {
-         "title": "Reconstruct Table",
-         "description": "Reconstruct the doc [table] as it is.",
-         "urls": ["https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/resolve/main/images/0.png"]
-     },
-     {
-         "title": "Describe Image",
-         "description": "Describe the image!",
-         "urls": ["https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/resolve/main/images/8.png"]
-     },
-     {
-         "title": "OCR Image",
-         "description": "OCR the image",
-         "urls": ["https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/resolve/main/images/2.jpg"]
-     },
-     {
-         "title": "Convert to Docling",
-         "description": "Convert this page to docling",
-         "urls": ["https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/resolve/main/images/1.png"]
-     },
- ]

- with gr.Blocks(css=css, fill_width=True, title="Multimodal OCR2") as demo:
-     state = gr.State({
-         "conversation_contexts": {},
-         "conversations": [],
-         "conversation_id": "",
-     })
-
-     with ms.Application(), antdx.XProvider(), ms.AutoLoading():
-         with antd.Row(gutter=[0, 0], wrap=False, elem_id="chatbot_container"):
-             # Left Sidebar for Conversations
-             with antd.Col(md=dict(flex="0 0 260px"), elem_id="conversations_sidebar"):
-                 with ms.Div(elem_classes="chatbot-conversations"):
-                     with antd.Flex(vertical=True, gap="small", elem_style=dict(height="100%")):
-                         gr.Markdown("### OCR Conversations")
-                         with antd.Button(color="primary", variant="filled", block=True) as add_conversation_btn:
-                             ms.Text("New Conversation")
-                             with ms.Slot("icon"): antd.Icon("PlusOutlined")
-                         with antdx.Conversations() as conversations:
-                             pass  # Handled by events
-
-             # Right Main Chat Area
-             with antd.Col(flex=1, elem_style=dict(height="100%")):
-                 with antd.Flex(vertical=True, gap="small", elem_id="main_chat_area"):
-                     gr.Markdown("## Multimodal OCR2")
-                     chatbot = pro.Chatbot(
-                         height="calc(100vh - 200px)",
-                         # FIX: The `prompts` key now holds a dictionary for categorization
-                         welcome_config={
-                             "prompts": {
-                                 "Examples": welcome_prompts
-                             },
-                             "title": "Start by selecting an example:"
-                         }
-                     )
-                     with pro.MultimodalInput(placeholder="Ask a question about your image or video...") as input_comp:
-                         with ms.Slot("prefix"):
-                             model_selector = gr.Dropdown(
-                                 choices=MODEL_CHOICES,
-                                 value=MODEL_CHOICES[0],
-                                 label="Select Model",
-                                 container=False
-                             )
-
-     # --- Event Wiring ---
-     add_conversation_btn.click(
-         fn=Gradio_Events.new_chat,
-         inputs=[state],
-         outputs=[conversations, chatbot, state, model_selector]
-     )
-     conversations.active_change(
-         fn=Gradio_Events.select_conversation,
-         inputs=[state],
-         outputs=[conversations, chatbot, state, model_selector]
-     )
-     chatbot.welcome_prompt_select(
-         fn=Gradio_Events.apply_prompt,
-         inputs=[],
-         outputs=[input_comp]
      )
-     submit_event = input_comp.submit(
-         fn=Gradio_Events.add_message,
-         inputs=[input_comp, state],
-         outputs=[input_comp, add_conversation_btn, conversations, chatbot, state]
      )
-     model_selector.change(
-         fn=Gradio_Events.on_model_change,
-         inputs=[model_selector, state],
-         outputs=[state]
      )

  if __name__ == "__main__":
-     demo.queue().launch(show_error=True, debug=True)
 
  import os
  import random
  import uuid
+ import json
  import time
+ import asyncio
  from threading import Thread

  import gradio as gr

  import cv2

  from transformers import (
+     Qwen2VLForConditionalGeneration,
      Qwen2_5_VLForConditionalGeneration,
+     AutoModelForCausalLM,
      AutoModelForVision2Seq,
      AutoProcessor,
      TextIteratorStreamer,
  )
+ from transformers.image_utils import load_image
+
+ # These imports seem to be from a custom library.
+ # If you have 'docling_core' installed, you can uncomment them.
+ # from docling_core.types.doc import DoclingDocument, DocTagsDocument

+ import re
+ import ast
+ import html
+
+ # --- Constants ---
  MAX_MAX_NEW_TOKENS = 5120
  DEFAULT_MAX_NEW_TOKENS = 3072
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  # --- Model Loading ---
+ # Load Nanonets-OCR-s
+ MODEL_ID_M = "nanonets/Nanonets-OCR-s"
+ processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_M,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ # Load MonkeyOCR
+ MODEL_ID_G = "echo840/MonkeyOCR"
+ SUBFOLDER = "Recognition"
+ processor_g = AutoProcessor.from_pretrained(
+     MODEL_ID_G,
+     trust_remote_code=True,
+     subfolder=SUBFOLDER
+ )
+ model_g = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_G,
+     trust_remote_code=True,
+     subfolder=SUBFOLDER,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ # Load Typhoon-OCR-7B
+ MODEL_ID_L = "scb10x/typhoon-ocr-7b"
+ processor_l = AutoProcessor.from_pretrained(MODEL_ID_L, trust_remote_code=True)
+ model_l = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_L,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ # Load SmolDocling-256M-preview
+ MODEL_ID_X = "ds4sd/SmolDocling-256M-preview"
+ processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
+ model_x = AutoModelForVision2Seq.from_pretrained(
+     MODEL_ID_X,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ # Thyme-RL
+ MODEL_ID_N = "Kwai-Keye/Thyme-RL"
+ processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
+ model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_N,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()


  # --- Preprocessing and Helper Functions ---
  def add_random_padding(image, min_percent=0.1, max_percent=0.10):
+     """Add random padding to an image based on its size."""
      image = image.convert("RGB")
      width, height = image.size
+     pad_w_percent = random.uniform(min_percent, max_percent)
+     pad_h_percent = random.uniform(min_percent, max_percent)
+     pad_w = int(width * pad_w_percent)
+     pad_h = int(height * pad_h_percent)
+     corner_pixel = image.getpixel((0, 0))  # Top-left corner
+     padded_image = ImageOps.expand(image, border=(pad_w, pad_h, pad_w, pad_h), fill=corner_pixel)
      return padded_image

+ def normalize_values(text, target_max=500):
+     """Normalize numerical values in text to a target maximum."""
+     def normalize_list(values):
+         max_value = max(values) if values else 1
+         return [round((v / max_value) * target_max) for v in values]
+
+     def process_match(match):
+         num_list = ast.literal_eval(match.group(0))
+         normalized = normalize_list(num_list)
+         return "".join([f"<loc_{num}>" for num in normalized])
+
+     pattern = r"\[([\d\.\s,]+)\]"
+     normalized_text = re.sub(pattern, process_match, text)
+     return normalized_text
+
+ def downsample_video(video_path):
+     """Downsample a video to evenly spaced frames, returning PIL images with timestamps."""
      vidcap = cv2.VideoCapture(video_path)
      total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+     fps = vidcap.get(cv2.CAP_PROP_FPS)
      frames = []
+     # Use 10 frames for video processing
+     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
+     for i in frame_indices:
+         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+         success, image = vidcap.read()
+         if success:
+             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+             pil_image = Image.fromarray(image)
+             timestamp = round(i / fps, 2)
+             frames.append((pil_image, timestamp))
      vidcap.release()
      return frames

+ # A placeholder function in case docling_core is not installed
+ def format_smoldocling_output(buffer_text, images):
+     cleaned_output = buffer_text.replace("<end_of_utterance>", "").strip()
+     # Check if docling_core is available and was imported
+     if 'DocTagsDocument' in globals() and 'DoclingDocument' in globals():
+         if any(tag in cleaned_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
+             if "<chart>" in cleaned_output:
+                 cleaned_output = cleaned_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
+                 cleaned_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', cleaned_output)
+             doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([cleaned_output], images)
+             doc = DoclingDocument.load_from_doctags(doctags_doc, document_name="Document")
+             markdown_output = doc.export_to_markdown()
+             return markdown_output
+     # Fallback if library is not available or tags are not present
+     return cleaned_output
+
+ # --- Core Generation Logic ---
+ def get_model_and_processor(model_name):
+     """Helper to select model and processor."""
+     if model_name == "Nanonets-OCR-s":
+         return processor_m, model_m
+     elif model_name == "MonkeyOCR-Recognition":
+         return processor_g, model_g
+     elif model_name == "SmolDocling-256M-preview":
+         return processor_x, model_x
+     elif model_name == "Typhoon-OCR-7B":
+         return processor_l, model_l
+     elif model_name == "Thyme-RL":
+         return processor_n, model_n
+     else:
+         return None, None
+
+ def is_video_file(filepath):
+     """Check if a file has a common video extension."""
+     if not filepath:
+         return False
+     video_extensions = ['.mp4', '.mov', '.avi', '.mkv', '.webm']
+     return any(filepath.lower().endswith(ext) for ext in video_extensions)

  @spaces.GPU
+ def generate_response(
+     media_file: str,
+     query: str,
+     model_name: str,
+     max_new_tokens: int,
+     temperature: float,
+     top_p: float
+ ):
+     """Unified generation function for both image and video."""
+     if media_file is None:
+         yield "Please upload an image or video file first.", "Please upload an image or video file first."
+         return
+
+     processor, model = get_model_and_processor(model_name)
+     if not processor or not model:
+         yield "Invalid model selected.", "Invalid model selected."
+         return
+
+     media_type = "video" if is_video_file(media_file) else "image"
+
+     if media_type == "video":
+         frames = downsample_video(media_file)
+         images = [frame for frame, _ in frames]
+     else:  # image
+         images = [Image.open(media_file)]
+
+     if model_name == "SmolDocling-256M-preview":
+         if "OTSL" in query or "code" in query:
+             images = [add_random_padding(img) for img in images]
+         if "OCR at text at" in query or "Identify element" in query or "formula" in query:
+             query = normalize_values(query, target_max=500)
+
+     messages = [
+         {"role": "user", "content": [{"type": "image"} for _ in images] + [{"type": "text", "text": query}]}
+     ]
+     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+     inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+
+     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     generation_kwargs = {
+         **inputs,
+         "streamer": streamer,
+         "max_new_tokens": max_new_tokens,
+         "temperature": temperature,
+         "top_p": top_p,
+     }
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text.replace("<|im_end|>", "")
+         yield buffer
+
+     if model_name == "SmolDocling-256M-preview":
+         formatted_output = format_smoldocling_output(buffer, images)
+         yield formatted_output
+     else:
+         yield buffer.strip()
+
+ # --- Gradio Interface ---
+
+ # --- Examples ---
+ image_examples = [
+     ["images/0.png", "Reconstruct the doc [table] as it is."],
+     ["images/8.png", "Describe the image!"],
+     ["images/2.jpg", "OCR the image"],
+     ["images/1.png", "Convert this page to docling"],
+     ["images/3.png", "Convert this page to docling"],
+     ["images/4.png", "Convert chart to OTSL."],
+     ["images/5.jpg", "Convert code to text"],
+     ["images/6.jpg", "Convert this table to OTSL."],
+     ["images/7.jpg", "Convert formula to latex."],
+ ]
+ video_examples = [
+     ["videos/1.mp4", "Explain the video in detail."],
+     ["videos/2.mp4", "Explain the video in detail."]
+ ]
+ all_examples = image_examples + video_examples


+ # --- UI Styling and Helper Functions ---
  css = """
+ body, .gradio-container { font-family: 'Inter', sans-serif; }
+ .main-container { padding: 20px; }
+ .sidebar { background-color: #F7F7F7; border-right: 1px solid #E0E0E0; padding: 15px; border-radius: 15px; }
+ .chat-window { min-height: 60vh; border: 1px solid #E0E0E0; border-radius: 15px; padding: 20px; box-shadow: 0 4px 8px rgba(0,0,0,0.05); }
+ .input-bar { padding: 10px; border-radius: 15px; background-color: #FFFFFF; border: 1px solid #E0E0E0; margin-top: 20px;}
+ .submit-button { background-color: #007AFF !important; color: white !important; font-weight: bold !important; }
+ .media-display {text-align: center; background-color: #F0F0F0; border-radius: 10px; padding: 10px; margin-bottom: 20px;}
+ .media-display img, .media-display video {max-height: 400px; margin: auto;}
  """

+ def handle_file_upload(file):
+     if file is None:
+         return None, gr.update(visible=False)
+     if is_video_file(file.name):
+         return gr.update(value=file.name, visible=False), gr.update(value=file.name, visible=True)
+     else:
+         return gr.update(value=file.name, visible=True), gr.update(value=file.name, visible=False)

+ def clear_all():
+     return None, None, None, ""
+
+ with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
+     # Hidden state to store the path to the uploaded file
+     media_file_path = gr.State(None)
+
+     with gr.Row(elem_classes="main-container"):
+         # --- Sidebar ---
+         with gr.Column(scale=1, elem_classes="sidebar"):
+             gr.Markdown("### OCR Conversations")
+             add_conv_btn = gr.Button("+ Add Conversation")
+             gr.Markdown("---")
+             gr.Markdown("#### Advanced Options")
+             with gr.Accordion("⚙️ Generation Settings", open=False):
+                 max_new_tokens = gr.Slider(
+                     label="Max New Tokens",
+                     minimum=256,
+                     maximum=MAX_MAX_NEW_TOKENS,
+                     step=64,
+                     value=DEFAULT_MAX_NEW_TOKENS,
+                 )
+                 temperature = gr.Slider(
+                     label="Temperature", minimum=0.1, maximum=1.0, step=0.05, value=0.6
+                 )
+                 top_p = gr.Slider(
+                     label="Top-p", minimum=0.1, maximum=1.0, step=0.05, value=0.9
+                 )
+
+         # --- Main Content Panel ---
+         with gr.Column(scale=4):
+             gr.Markdown("# Multimodal OCR")
+
+             # --- Media Display Area ---
+             with gr.Column(elem_classes="media-display"):
+                 image_display = gr.Image(type="filepath", label="Image Preview", visible=False)
+                 video_display = gr.Video(label="Video Preview", visible=False)
+                 gr.Markdown("Upload an image or video to begin.")
+
+             # --- Examples ---
+             gr.Examples(
+                 examples=all_examples,
+                 inputs=[media_file_path, "query_input"],
+                 label="Examples (Click to run)",
+                 fn=handle_file_upload,  # Custom function to update media display
+                 outputs=[image_display, video_display]
+             )
+
+             # --- Chat/Output Window ---
+             output_display = gr.Markdown(elem_classes="chat-window", value="### Output will be shown here")
+
+             # --- Input Bar ---
+             with gr.Row(elem_classes="input-bar", vertical=False):
+                 upload_btn = gr.UploadButton("📁 Add Files", file_types=["image", "video"])
+                 model_dropdown = gr.Dropdown(
+                     choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
+                     label="Select Model",
+                     value="Nanonets-OCR-s"
+                 )
+                 query_input = gr.Textbox(
+                     placeholder="Enter your query here...",
+                     show_label=False,
+                     scale=4,
+                 )
+                 submit_btn = gr.Button("▶", elem_classes="submit-button")
+
+     # --- Event Handlers ---
+     upload_btn.upload(
+         fn=handle_file_upload,
+         inputs=[upload_btn],
+         outputs=[image_display, video_display]
      )
+
+     # When file is uploaded, also store its path in the state
+     upload_btn.upload(lambda f: f.name if f else None, upload_btn, media_file_path)
+
+     submit_btn.click(
+         fn=generate_response,
+         inputs=[media_file_path, query_input, model_dropdown, max_new_tokens, temperature, top_p],
+         outputs=[output_display]
      )
+
+     add_conv_btn.click(
+         fn=clear_all,
+         outputs=[media_file_path, image_display, video_display, output_display]
      )

+
  if __name__ == "__main__":
+     demo.queue(max_size=50).launch(share=True, show_error=True)
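
For reference, a minimal sketch of what the `normalize_values` helper added in this commit does to bracketed coordinate lists in a SmolDocling query. This snippet is not part of app.py; it re-runs the same regex/`ast` logic standalone, and the sample query string is hypothetical.

import re
import ast

def normalize_values(text, target_max=500):
    # Same logic as the helper introduced above: rescale each bracketed
    # number list to target_max and emit <loc_N> tokens.
    def normalize_list(values):
        max_value = max(values) if values else 1
        return [round((v / max_value) * target_max) for v in values]

    def process_match(match):
        nums = normalize_list(ast.literal_eval(match.group(0)))
        return "".join(f"<loc_{n}>" for n in nums)

    return re.sub(r"\[([\d\.\s,]+)\]", process_match, text)

print(normalize_values("Identify element at [10, 20, 40]"))
# -> Identify element at <loc_125><loc_250><loc_500>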