Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,9 +1,9 @@
 import os
 import random
 import uuid
-import json
 import time
-import
 from threading import Thread
 
 import gradio as gr
@@ -14,336 +14,373 @@ from PIL import Image, ImageOps
 import cv2
 
 from transformers import (
-    Qwen2VLForConditionalGeneration,
     Qwen2_5_VLForConditionalGeneration,
-    AutoModelForCausalLM,
     AutoModelForVision2Seq,
     AutoProcessor,
     TextIteratorStreamer,
 )
-from
 
-#
-# If you have 'docling_core' installed, you can uncomment them.
-# from docling_core.types.doc import DoclingDocument, DocTagsDocument
-
-import re
-import ast
-import html
-
-# Constants for text generation
 MAX_MAX_NEW_TOKENS = 5120
 DEFAULT_MAX_NEW_TOKENS = 3072
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 # --- Model Loading ---
-#
-    MODEL_ID_X,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-
-# Thyme-RL
-MODEL_ID_N = "Kwai-Keye/Thyme-RL"
-processor_n = AutoProcessor.from_pretrained(MODEL_ID_N, trust_remote_code=True)
-model_n = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_N,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
 
 
 # --- Preprocessing and Helper Functions ---
 def add_random_padding(image, min_percent=0.1, max_percent=0.10):
-    """Add random padding to an image
     image = image.convert("RGB")
     width, height = image.size
-    pad_h = int(height * pad_h_percent)
-    corner_pixel = image.getpixel((0, 0))  # Top-left corner
-    padded_image = ImageOps.expand(image, border=(pad_w, pad_h, pad_w, pad_h), fill=corner_pixel)
     return padded_image
 
-def
-    """
-
-    max_value = max(values) if values else 1
-    return [round((v / max_value) * target_max) for v in values]
-
-    def process_match(match):
-        num_list = ast.literal_eval(match.group(0))
-        normalized = normalize_list(num_list)
-        return "".join([f"<loc_{num}>" for num in normalized])
-
-    pattern = r"\[([\d\.\s,]+)\]"
-    normalized_text = re.sub(pattern, process_match, text)
-    return normalized_text
-
-def downsample_video(video_path):
-    """Downsample a video to evenly spaced frames, returning PIL images with timestamps."""
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
-    fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
-        pil_image = Image.fromarray(image)
-        timestamp = round(i / fps, 2)
-        frames.append((pil_image, timestamp))
     vidcap.release()
     return frames
 
-    elif model_name == "Typhoon-OCR-7B":
-        return processor_l, model_l
-    elif model_name == "Thyme-RL":
-        return processor_n, model_n
-    else:
-        return None, None
-
-@spaces.GPU
-def generate_response(model_name: str, text: str, media_input, media_type: str,
-                      max_new_tokens: int, temperature: float, top_p: float, top_k: int, repetition_penalty: float):
-    """Unified generation function for both image and video."""
-    processor, model = get_model_and_processor(model_name)
-    if not processor or not model:
-        yield "Invalid model selected.", "Invalid model selected."
-        return
-
-    if media_input is None:
-        yield f"Please upload a {media_type}.", f"Please upload a {media_type}."
-        return
-
-    if media_type == "video":
-        frames = downsample_video(media_input)
-        images = [frame for frame, _ in frames]
-    else:  # image
-        images = [media_input]
-
-    if model_name == "SmolDocling-256M-preview":
         if "OTSL" in text or "code" in text:
             images = [add_random_padding(img) for img in images]
-        if "OCR at text at" in text or "Identify element" in text or "formula" in text:
-            text = normalize_values(text, target_max=500)
-
-    messages = [
-        {"role": "user", "content": [{"type": "image"} for _ in images] + [{"type": "text", "text": text}]}
-    ]
-    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
-    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
-
-    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-    }
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text.replace("<|im_end|>", "")
-        yield buffer, buffer
-
-    if model_name == "SmolDocling-256M-preview":
-        raw_output, formatted_output = format_smoldocling_output(buffer, images)
-        yield raw_output, formatted_output
-    else:
-        # For other models, the formatted output is just the cleaned buffer
-        yield buffer, buffer.strip()
-
-def generate_image_wrapper(*args):
-    yield from generate_response(*args, media_type="image")
-
-def generate_video_wrapper(*args):
-    yield from generate_response(*args, media_type="video")
-
-
-# --- Examples ---
-image_examples = [
-    ["Reconstruct the doc [table] as it is.", "images/0.png"],
-    ["Describe the image!", "images/8.png"],
-    ["OCR the image", "images/2.jpg"],
-    ["Convert this page to docling", "images/1.png"],
-    ["Convert this page to docling", "images/3.png"],
-    ["Convert chart to OTSL.", "images/4.png"],
-    ["Convert code to text", "images/5.jpg"],
-    ["Convert this table to OTSL.", "images/6.jpg"],
-    ["Convert formula to latex.", "images/7.jpg"],
-]
 
 
 css = """
-.
-}
-.submit-btn:hover {
-    background-color: #3498db !important;
-    box-shadow: 2px 2px 8px rgba(0,0,0,0.3) !important;
-}
-.canvas-output {
-    border: 2px solid #4682B4;
-    border-radius: 10px;
-    padding: 20px;
-    background-color: #f0f8ff;
 }
 """
 
-#
     )
-    with gr.Accordion("📄 Formatted Result (Result.md)", open=True):
-        formatted_output = gr.Markdown(label="Formatted Output")
-
-    model_choice = gr.Radio(
-        choices=["Nanonets-OCR-s", "MonkeyOCR-Recognition", "Thyme-RL", "Typhoon-OCR-7B", "SmolDocling-256M-preview"],
-        label="🤖 Select Model",
-        value="Nanonets-OCR-s"
-    )
-
-    gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/discussions)")
-    gr.Markdown("> **[Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s)**: A powerful, state-of-the-art image-to-markdown OCR model that transforms documents into structured markdown with intelligent content recognition.")
-    gr.Markdown("> **[SmolDocling-256M](https://huggingface.co/ds4sd/SmolDocling-256M-preview)**: A multimodal Image-Text-to-Text model designed for efficient document conversion, retaining key features of the larger Docling model.")
-    gr.Markdown("> **[MonkeyOCR-Recognition](https://huggingface.co/echo840/MonkeyOCR)**: Adopts a Structure-Recognition-Relation (SRR) paradigm, simplifying the pipeline for document processing.")
-    gr.Markdown("> **[Typhoon-OCR-7B](https://huggingface.co/scb10x/typhoon-ocr-7b)**: A bilingual document parsing model for real-world documents in Thai and English, capable of extracting text from images and charts.")
-    gr.Markdown("> **[Thyme-RL](https://huggingface.co/Kwai-Keye/Thyme-RL)**: Thyme transcends traditional 'thinking with images' by autonomously generating and executing code for image processing and computation, enhancing performance on complex reasoning tasks.")
-    gr.Markdown("> ⚠️ **Note**: All models in this space are primarily optimized for image tasks and may not perform as well on video inference use cases.")
-
-    # --- Event Handlers ---
-    common_inputs = [model_choice, max_new_tokens, temperature, top_p, top_k, repetition_penalty]
-    common_outputs = [raw_output, formatted_output]
-
-    image_submit.click(
-        fn=generate_image_wrapper,
-        inputs=[image_query, image_upload] + common_inputs,
-        outputs=common_outputs
     )
     )
 
 if __name__ == "__main__":
-    demo.queue(
 import os
 import random
 import uuid
 import time
+import base64
+from http import HTTPStatus
 from threading import Thread
 
 import gradio as gr
 import cv2
 
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoModelForVision2Seq,
     AutoProcessor,
     TextIteratorStreamer,
 )
+from gradio_client import utils as client_utils
+import modelscope_studio.components.antd as antd
+import modelscope_studio.components.antdx as antdx
+import modelscope_studio.components.base as ms
+import modelscope_studio.components.pro as pro
 
+# --- Constants and Configuration ---
 MAX_MAX_NEW_TOKENS = 5120
 DEFAULT_MAX_NEW_TOKENS = 3072
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
 # --- Model Loading ---
+# A dictionary to hold our models and processors for easy access
+models = {}
+processors = {}
+MODEL_CHOICES = [
+    "Nanonets-OCR-s",
+    "MonkeyOCR-Recognition",
+    "Thyme-RL",
+    "Typhoon-OCR-7B",
+    "SmolDocling-256M-preview"
+]
+
+def load_model(model_id, processor_class, model_class, subfolder=None, model_key=''):
+    """Helper function to load a model and processor."""
+    print(f"Loading model: {model_key}...")
+    try:
+        processor_args = {"trust_remote_code": True}
+        model_args = {"trust_remote_code": True, "torch_dtype": torch.float16}
+
+        if subfolder:
+            processor_args["subfolder"] = subfolder
+            model_args["subfolder"] = subfolder
+
+        processors[model_key] = processor_class.from_pretrained(model_id, **processor_args)
+        models[model_key] = model_class.from_pretrained(model_id, **model_args).to(device).eval()
+        print(f"Successfully loaded {model_key}.")
+    except Exception as e:
+        print(f"Error loading model {model_key}: {e}")
+        # If a model fails to load, remove it from the choices
+        if model_key in MODEL_CHOICES:
+            MODEL_CHOICES.remove(model_key)
+
+# Load all models
+load_model("nanonets/Nanonets-OCR-s", AutoProcessor, Qwen2_5_VLForConditionalGeneration, model_key="Nanonets-OCR-s")
+load_model("echo840/MonkeyOCR", AutoProcessor, Qwen2_5_VLForConditionalGeneration, subfolder="Recognition", model_key="MonkeyOCR-Recognition")
+load_model("scb10x/typhoon-ocr-7b", AutoProcessor, Qwen2_5_VLForConditionalGeneration, model_key="Typhoon-OCR-7B")
+load_model("ds4sd/SmolDocling-256M-preview", AutoProcessor, AutoModelForVision2Seq, model_key="SmolDocling-256M-preview")
+load_model("Kwai-Keye/Thyme-RL", AutoProcessor, Qwen2_5_VLForConditionalGeneration, model_key="Thyme-RL")
 
 
 # --- Preprocessing and Helper Functions ---
 def add_random_padding(image, min_percent=0.1, max_percent=0.10):
+    """Add random padding to an image."""
     image = image.convert("RGB")
     width, height = image.size
+    pad_w = int(width * random.uniform(min_percent, max_percent))
+    pad_h = int(height * random.uniform(min_percent, max_percent))
+    padded_image = ImageOps.expand(image, border=(pad_w, pad_h, pad_w, pad_h), fill=image.getpixel((0, 0)))
     return padded_image
 
+def downsample_video(video_path, num_frames=10):
+    """Downsample a video into a list of PIL Image frames."""
+    if not os.path.exists(video_path): return []
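+    # Decode only num_frames evenly spaced frames instead of the full video.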
     vidcap = cv2.VideoCapture(video_path)
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     frames = []
+    if total_frames > 0:
+        frame_indices = np.linspace(0, total_frames - 1, num_frames, dtype=int)
+        for i in frame_indices:
+            vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+            success, image = vidcap.read()
+            if success:
+                frames.append(Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)))
     vidcap.release()
     return frames
 
+def format_history_for_model(history, selected_model):
+    """Prepares history for the multimodal model, handling text and media files."""
+    last_user_message = next((item for item in reversed(history) if item["role"] == "user"), None)
+    if not last_user_message:
+        return None, [], ""
+
+    text = ""
+    files = []
+    images = []
+
+    for content_part in last_user_message["content"]:
+        if content_part["type"] == "text":
+            text = content_part["content"]
+        elif content_part["type"] == "file":
+            files.extend(content_part["content"])
+
+    for file_path in files:
+        mime_type = client_utils.get_mimetype(file_path)
+        if mime_type.startswith("image"):
+            images.append(Image.open(file_path))
+        elif mime_type.startswith("video"):
+            images.extend(downsample_video(file_path))
+
+    # Apply model-specific preprocessing
+    if selected_model == "SmolDocling-256M-preview":
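+        # Pad the input images when the prompt asks for table (OTSL) or code conversion.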
         if "OTSL" in text or "code" in text:
             images = [add_random_padding(img) for img in images]
 
+    return text, images, selected_model
+
+
+# --- Gradio Events and Application Logic ---
+class Gradio_Events:
+
+    @staticmethod
+    def submit(state_value):
+        conv_id = state_value["conversation_id"]
+        context = state_value["conversation_contexts"][conv_id]
+        history = context["history"]
+        model_name = context.get("selected_model", MODEL_CHOICES[0])
+
+        processor = processors.get(model_name)
+        model = models.get(model_name)
+
+        if not processor or not model:
+            history.append({"role": "assistant", "content": [{"type": "text", "content": f"Error: Model '{model_name}' not loaded."}]})
+            yield {chatbot: gr.update(value=history), state: gr.update(value=state_value)}
+            return
+
+        text, images, _ = format_history_for_model(history, model_name)
+
+        if not text and not images:
+            yield {chatbot: gr.update(value=history), state: gr.update(value=state_value)}
+            return
+
+        history.append({
+            "role": "assistant",
+            "content": [],
+            "key": str(uuid.uuid4()),
+            "loading": True,
+        })
+        yield {chatbot: gr.update(value=history), state: gr.update(value=state_value)}
+
+        try:
+            messages = [{"role": "user", "content": []}]
+            if images:
+                messages[0]["content"].extend([{"type": "image"}] * len(images))
+            messages[0]["content"].append({"type": "text", "text": text or "Describe the media."})
+
+            prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
+            inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
+
+            streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+            generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": MAX_MAX_NEW_TOKENS}
+
+            thread = Thread(target=model.generate, kwargs=generation_kwargs)
+            thread.start()
+
+            buffer = ""
+            for new_text in streamer:
+                buffer += new_text.replace("<|im_end|>", "")
+                history[-1]["content"] = [{"type": "text", "content": buffer}]
+                history[-1]["loading"] = True
+                yield {chatbot: gr.update(value=history), state: gr.update(value=state_value)}
+
+            history[-1]["loading"] = False
+            # Final post-processing, especially for models like SmolDocling
+            final_content = buffer.strip().replace("<end_of_utterance>", "")
+            history[-1]["content"] = [{"type": "text", "content": final_content}]
+
+            yield {chatbot: gr.update(value=history), state: gr.update(value=state_value)}
+
+        except Exception as e:
+            print(f"Error during model generation: {e}")
+            history[-1]["loading"] = False
+            history[-1]["content"] = [{"type": "text", "content": f'<span style="color: red;">An error occurred: {e}</span>'}]
+            yield {chatbot: gr.update(value=history), state: gr.update(value=state_value)}
+
+    @staticmethod
+    def add_message(input_value, state_value):
+        text = input_value["text"]
+        files = input_value["files"]
+
+        if not state_value["conversation_id"]:
+            random_id = str(uuid.uuid4())
+            state_value["conversation_id"] = random_id
+            state_value["conversations"].append({"label": text or "New Chat", "key": random_id})
+            state_value["conversation_contexts"][random_id] = {
+                "history": [],
+                "selected_model": MODEL_CHOICES[0]  # Default model
+            }
+
+        conv_id = state_value["conversation_id"]
+        history = state_value["conversation_contexts"][conv_id]["history"]
+        history.append({
+            "key": str(uuid.uuid4()),
+            "role": "user",
+            "content": [{"type": "file", "content": files}, {"type": "text", "content": text}]
+        })
+
+        yield Gradio_Events.preprocess_submit(clear_input=True)(state_value)
+        for chunk in Gradio_Events.submit(state_value):
+            yield chunk
+        yield Gradio_Events.postprocess_submit(state_value)
+
+    @staticmethod
+    def preprocess_submit(clear_input=True):
+        def handler(state_value):
+            conv_id = state_value["conversation_id"]
+            history = state_value["conversation_contexts"][conv_id]["history"]
+            return {
+                input_comp: gr.update(value={'text': '', 'files': []} if clear_input else {}, loading=True),
+                conversations: gr.update(active_key=conv_id, items=state_value["conversations"]),
+                add_conversation_btn: gr.update(disabled=True),
+                chatbot: gr.update(value=history),
+                state: gr.update(value=state_value),
+            }
+        return handler
+
+    @staticmethod
+    def postprocess_submit(state_value):
+        conv_id = state_value["conversation_id"]
+        history = state_value["conversation_contexts"][conv_id]["history"]
+        return {
+            input_comp: gr.update(loading=False),
+            add_conversation_btn: gr.update(disabled=False),
+            chatbot: gr.update(value=history),
+            state: gr.update(value=state_value),
+        }
+
+    @staticmethod
+    def apply_prompt(e: gr.EventData):
+        # Example format: {"description": "Query text", "urls": ["path/to/image.png"]}
+        prompt_data = e._data["payload"][0]["value"]
+        return gr.update(value={'text': prompt_data['description'], 'files': prompt_data['urls']})
+
+    @staticmethod
+    def new_chat(state_value):
+        state_value["conversation_id"] = ""
+        return gr.update(active_key=""), gr.update(value=None), gr.update(value=state_value), gr.update(value=MODEL_CHOICES[0])
+
+    @staticmethod
+    def select_conversation(state_value, e: gr.EventData):
+        active_key = e._data["payload"][0]
+        if state_value["conversation_id"] == active_key or active_key not in state_value["conversation_contexts"]:
+            return gr.skip()
+        state_value["conversation_id"] = active_key
+        context = state_value["conversation_contexts"][active_key]
+        return gr.update(active_key=active_key), gr.update(value=context["history"]), gr.update(value=state_value), gr.update(value=context.get("selected_model", MODEL_CHOICES[0]))
+
+    @staticmethod
+    def on_model_change(model_name, state_value):
+        if state_value["conversation_id"]:
+            state_value["conversation_contexts"][state_value["conversation_id"]]["selected_model"] = model_name
+        return state_value
 
+
+# --- UI Layout and Components ---
 css = """
+.gradio-container { padding: 0 !important; }
+main.fillable { padding: 0 !important; }
+#chatbot_container { height: calc(100vh - 80px); max-height: 1000px; }
+#conversations_sidebar .chatbot-conversations {
+    height: 100vh; background-color: var(--ms-gr-ant-color-bg-layout); padding: 8px;
 }
+#main_chat_area { padding: 16px; height: 100%; }
 """
 
+# Define welcome prompts based on available examples
+welcome_prompts = [
+    {
+        "title": "Reconstruct Table",
+        "description": "Reconstruct the doc [table] as it is.",
+        "urls": ["https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/resolve/main/images/0.png"]
+    },
+    {
+        "title": "Describe Image",
+        "description": "Describe the image!",
+        "urls": ["https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/resolve/main/images/8.png"]
+    },
+    {
+        "title": "OCR Image",
+        "description": "OCR the image",
+        "urls": ["https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/resolve/main/images/2.jpg"]
+    },
+    {
+        "title": "Convert to Docling",
+        "description": "Convert this page to docling",
+        "urls": ["https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/resolve/main/images/1.png"]
+    },
+    {
+        "title": "Convert Chart",
+        "description": "Convert chart to OTSL.",
+        "urls": ["https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/resolve/main/images/4.png"]
+    },
+    {
+        "title": "Extract Code",
+        "description": "Convert code to text",
+        "urls": ["https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR2/resolve/main/images/5.jpg"]
+    },
+]
+
+with gr.Blocks(css=css, fill_width=True, title="Multimodal OCR2") as demo:
+    state = gr.State({
+        "conversation_contexts": {},
+        "conversations": [],
+        "conversation_id": "",
+    })
+
+    with ms.Application(), antdx.XProvider(), ms.AutoLoading():
+        with antd.Row(gutter=[0, 0], wrap=False, elem_id="chatbot_container"):
+            # Left Sidebar for Conversations
+            with antd.Col(md=dict(flex="0 0 260px"), elem_id="conversations_sidebar"):
+                with ms.Div(elem_classes="chatbot-conversations"):
+                    with antd.Flex(vertical=True, gap="small", elem_style=dict(height="100%")):
+                        gr.Markdown("### OCR Conversations")
+                        with antd.Button(color="primary", variant="filled", block=True) as add_conversation_btn:
+                            ms.Text("New Conversation")
+                            with ms.Slot("icon"): antd.Icon("PlusOutlined")
+                        with antdx.Conversations() as conversations:
+                            pass  # Handled by events
+
+            # Right Main Chat Area
+            with antd.Col(flex=1, elem_style=dict(height="100%")):
+                with antd.Flex(vertical=True, gap="small", elem_id="main_chat_area"):
+                    gr.Markdown("## Multimodal OCR2")
+                    chatbot = pro.Chatbot(
+                        height="calc(100vh - 200px)",
+                        welcome_config=pro.Chatbot.WelcomeConfig(prompts=welcome_prompts, title="Start by selecting an example:")
                     )
+                    with pro.MultimodalInput(placeholder="Ask a question about your image or video...") as input_comp:
+                        with ms.Slot("prefix"):
+                            model_selector = gr.Dropdown(
+                                choices=MODEL_CHOICES,
+                                value=MODEL_CHOICES[0],
+                                label="Select Model",
+                                container=False
+                            )
+
+    # --- Event Wiring ---
+    add_conversation_btn.click(
+        fn=Gradio_Events.new_chat,
+        inputs=[state],
+        outputs=[conversations, chatbot, state, model_selector]
     )
+    conversations.active_change(
+        fn=Gradio_Events.select_conversation,
+        inputs=[state],
+        outputs=[conversations, chatbot, state, model_selector]
+    )
+    chatbot.welcome_prompt_select(
+        fn=Gradio_Events.apply_prompt,
+        inputs=[],
+        outputs=[input_comp]
+    )
+    submit_event = input_comp.submit(
+        fn=Gradio_Events.add_message,
+        inputs=[input_comp, state],
+        outputs=[input_comp, add_conversation_btn, conversations, chatbot, state]
+    )
+    model_selector.change(
+        fn=Gradio_Events.on_model_change,
+        inputs=[model_selector, state],
+        outputs=[state]
     )
 
 if __name__ == "__main__":
+    demo.queue().launch(show_error=True, debug=True)