prithivMLmods committed on
Commit 063e299 · verified · 1 Parent(s): be8b851

Update app.py

Files changed (1)
  1. app.py +370 -179
app.py CHANGED
@@ -11,13 +11,13 @@ from threading import Thread
from io import BytesIO
import uuid
import tempfile

import gradio as gr
- import requests
import torch
from PIL import Image
- import fitz
- import numpy as np


from transformers import (
@@ -26,33 +26,29 @@ from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    TextIteratorStreamer,
-     AutoTokenizer,
)

- from transformers.image_utils import load_image as hf_load_image
-
- from reportlab.lib.pagesizes import A4
- from reportlab.lib.styles import getSampleStyleSheet
- from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer
- from reportlab.lib.units import inch
-
# --- Constants and Model Setup ---
MAX_INPUT_TOKEN_LENGTH = 4096
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.__version__ =", torch.__version__)
print("torch.version.cuda =", torch.version.cuda)
- print("cuda available:", torch.cuda.is_available())
- print("cuda device count:", torch.cuda.device_count())
if torch.cuda.is_available():
-     print("current device:", torch.cuda.current_device())
-     print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
-
print("Using device:", device)

# --- Model Loading ---
# Load Camel-Doc-OCR-062825
MODEL_ID_M = "prithivMLmods/Camel-Doc-OCR-062825"
processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -60,8 +56,10 @@ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()

# MinerU2.5-2509
MODEL_ID_T = "opendatalab/MinerU2.5-2509-1.2B"
processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
model_t = Qwen2VLForConditionalGeneration.from_pretrained(
@@ -69,8 +67,11 @@ model_t = Qwen2VLForConditionalGeneration.from_pretrained(
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()

# Load Video-MTR
MODEL_ID_S = "Phoebe13/Video-MTR"
processor_s = AutoProcessor.from_pretrained(MODEL_ID_S, trust_remote_code=True)
model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@@ -78,8 +79,10 @@ model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()

- # moondream3
MODEL_ID_MD3 = "moondream/moondream3-preview"
model_md3 = AutoModelForCausalLM.from_pretrained(
    MODEL_ID_MD3,
@@ -87,79 +90,228 @@ model_md3 = AutoModelForCausalLM.from_pretrained(
    torch_dtype=torch.bfloat16,
    device_map={"": "cuda"},
)
- # FIX: Added trust_remote_code=True to resolve the loading error
- tokenizer_md3 = AutoTokenizer.from_pretrained(MODEL_ID_MD3, trust_remote_code=True)


- # --- PDF Generation and Preview Utility Function ---
- def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):
-     """
-     Generates a PDF, saves it, and then creates image previews of its pages.
-     Returns the path to the PDF and a list of paths to the preview images.
-     """
-     if image is None or not text_content or not text_content.strip():
-         raise gr.Error("Cannot generate PDF. Image or text content is missing.")
-
-     # --- 1. Generate the PDF ---
-     temp_dir = tempfile.gettempdir()
-     pdf_filename = os.path.join(temp_dir, f"output_{uuid.uuid4()}.pdf")
-     doc = SimpleDocTemplate(
-         pdf_filename,
-         pagesize=A4,
-         rightMargin=inch, leftMargin=inch,
-         topMargin=inch, bottomMargin=inch
    )
-     styles = getSampleStyleSheet()
-     style_normal = styles["Normal"]
-     style_normal.fontSize = int(font_size)
-     style_normal.leading = int(font_size) * line_spacing
-     style_normal.alignment = {"Left": 0, "Center": 1, "Right": 2, "Justified": 4}[alignment]

-     story = []

-     img_buffer = BytesIO()
-     image.save(img_buffer, format='PNG')
-     img_buffer.seek(0)

-     page_width, _ = A4
-     available_width = page_width - 2 * inch
-     image_widths = {
-         "Small": available_width * 0.3,
-         "Medium": available_width * 0.6,
-         "Large": available_width * 0.9,
-     }
-     img_width = image_widths[image_size]
-     img = RLImage(img_buffer, width=img_width, height=image.height * (img_width / image.width))
-     story.append(img)
-     story.append(Spacer(1, 12))

-     cleaned_text = re.sub(r'#+\s*', '', text_content).replace("*", "")
-     text_paragraphs = cleaned_text.split('\n')

-     for para in text_paragraphs:
-         if para.strip():
-             story.append(Paragraph(para, style_normal))

-     doc.build(story)

-     # --- 2. Render PDF pages as images for preview ---
-     preview_images = []
-     try:
-         pdf_doc = fitz.open(pdf_filename)
-         for page_num in range(len(pdf_doc)):
-             page = pdf_doc.load_page(page_num)
-             pix = page.get_pixmap(dpi=150)
-             preview_img_path = os.path.join(temp_dir, f"preview_{uuid.uuid4()}_p{page_num}.png")
-             pix.save(preview_img_path)
-             preview_images.append(preview_img_path)
-         pdf_doc.close()
-     except Exception as e:
-         print(f"Error generating PDF preview: {e}")
-
-     return pdf_filename, preview_images


- # --- Core Application Logic ---
@spaces.GPU
def process_document_stream(
    model_name: str,
@@ -172,39 +324,27 @@ def process_document_stream(
    repetition_penalty: float
):
    """
-     Main generator function that handles model inference tasks with advanced generation parameters.
    """
    if image is None:
-         yield "Please upload an image.", ""
        return
    if not prompt_input or not prompt_input.strip():
-         yield "Please enter a prompt.", ""
        return

-     # --- Special Handling for Moondream3 ---
-     if model_name == "Moondream3":
-         # Moondream3 uses a different prompt structure and doesn't stream by default in this implementation
-         prompt_full = f"<image>\n\nQuestion: {prompt_input}\n\nAnswer:"
-         answer = model_md3.answer_question(
-             model_md3.encode_image(image),
-             prompt_full,
-             tokenizer=tokenizer_md3
-         )
-         yield answer, answer
-         return
-
-     processor = None
-     model = None
-
-     # --- Generic Handling for all other models ---
-     if model_name == "Camel-Doc-OCR-062825": processor, model = processor_m, model_m
-     elif model_name == "MinerU2.5-2509-1.2B": processor, model = processor_t, model_t
-     elif model_name == "Video-MTR": processor, model = processor_s, model_s
    else:
-         yield "Invalid model selected.", ""
        return

-     messages = [{"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt_input}]}]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)

@@ -218,7 +358,7 @@ def process_document_stream(
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
-         "do_sample": True
    }

    thread = Thread(target=model.generate, kwargs=generation_kwargs)
@@ -227,12 +367,10 @@ def process_document_stream(
    buffer = ""
    for new_text in streamer:
        buffer += new_text
-         buffer = buffer.replace("<|im_end|>", "")
        time.sleep(0.01)
-         yield buffer, buffer
-
-     yield buffer, buffer
-

# --- Gradio UI Definition ---
def create_gradio_interface():
@@ -241,89 +379,142 @@ def create_gradio_interface():
    .main-container { max-width: 1400px; margin: 0 auto; }
    .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
    .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
-     #gallery { min-height: 400px; }
    """
    with gr.Blocks(theme="bethecloud/storj_theme", css=css) as demo:
-         gr.HTML("""
-         <div class="title" style="text-align: center">
-             <h1>Multimodal VLM v1.0</h1>
-             <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
-                 Multimodal VLM for Image Content Extraction and Understanding
-             </p>
-         </div>
-         """)
-
-         with gr.Row():
-             # Left Column (Inputs)
-             with gr.Column(scale=1):
-                 model_choice = gr.Dropdown(
-                     choices=["Camel-Doc-OCR-062825", "MinerU2.5-2509-1.2B", "Video-MTR", "Moondream3"],
-                     label="Select Model", value="Camel-Doc-OCR-062825"
-                 )
-
-                 prompt_input = gr.Textbox(label="Query Input", placeholder="✦︎ Enter the prompt")
-                 image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])

-                 with gr.Accordion("Advanced Settings", open=False):
-                     max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=2048, step=256, label="Max New Tokens")
-                     temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
-                     top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
-                     top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
-                     repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-
-                 gr.Markdown("### PDF Export Settings")
-                 font_size = gr.Dropdown(choices=["8", "10", "12", "14", "16", "18"], value="12", label="Font Size")
-                 line_spacing = gr.Dropdown(choices=[1.0, 1.15, 1.5, 2.0], value=1.15, label="Line Spacing")
-                 alignment = gr.Dropdown(choices=["Left", "Center", "Right", "Justified"], value="Justified", label="Text Alignment")
-                 image_size = gr.Dropdown(choices=["Small", "Medium", "Large"], value="Medium", label="Image Size in PDF")
-
-                 process_btn = gr.Button("🚀 Process Image", variant="primary", elem_classes=["process-button"], size="lg")
-                 clear_btn = gr.Button("🗑️ Clear All", variant="secondary")
-
-             # Right Column (Outputs)
-             with gr.Column(scale=2):
-                 with gr.Tabs() as tabs:
-                     with gr.Tab("📝 Extracted Content"):
-                         raw_output_stream = gr.Textbox(label="Raw Model Output Stream", interactive=False, lines=15, show_copy_button=True)
                        with gr.Row():
-                             examples = gr.Examples(
-                                 examples=["examples/1.png", "examples/2.png", "examples/3.png",
-                                           "examples/4.png", "examples/5.png", "examples/6.png"],
-                                 inputs=image_input, label="Examples"
-                             )
-                         gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/Multimodal-VLM-v1.0/discussions) | [prithivMLmods🤗](https://huggingface.co/prithivMLmods)")

-                     with gr.Tab("📰 README.md"):
-                         with gr.Accordion("(Result.md)", open=True):
-                             markdown_output = gr.Markdown()
-
-                     with gr.Tab("📋 PDF Preview"):
-                         generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
-                         pdf_output_file = gr.File(label="Download Generated PDF", interactive=False)
-                         pdf_preview_gallery = gr.Gallery(label="PDF Page Preview", show_label=True, elem_id="gallery", columns=2, object_fit="contain", height="auto")
-
-         # Event Handlers
-         def clear_all_outputs():
-             return None, "", "Raw output will appear here.", "", None, None
-
        process_btn.click(
            fn=process_document_stream,
-             inputs=[model_choice, image_input, prompt_input, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-             outputs=[raw_output_stream, markdown_output]
        )

-         generate_pdf_btn.click(
-             fn=generate_and_preview_pdf,
-             inputs=[image_input, raw_output_stream, font_size, line_spacing, alignment, image_size],
-             outputs=[pdf_output_file, pdf_preview_gallery]
        )
-
-         clear_btn.click(
-             clear_all_outputs,
-             outputs=[image_input, prompt_input, raw_output_stream, markdown_output, pdf_output_file, pdf_preview_gallery]
        )

    return demo

if __name__ == "__main__":
    demo = create_gradio_interface()
-     demo.queue(max_size=50).launch(share=True, ssr_mode=False, show_error=True)

from io import BytesIO
import uuid
import tempfile
+ import cv2

import gradio as gr
+ import numpy as np
import torch
from PIL import Image
+ import supervision as sv


from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    TextIteratorStreamer,
)

# --- Constants and Model Setup ---
MAX_INPUT_TOKEN_LENGTH = 4096
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

+ print("--- System Information ---")
print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.__version__ =", torch.__version__)
print("torch.version.cuda =", torch.version.cuda)
+ print("CUDA available:", torch.cuda.is_available())
+ print("CUDA device count:", torch.cuda.device_count())
if torch.cuda.is_available():
+     print("Current device:", torch.cuda.current_device())
+     print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
print("Using device:", device)
+ print("--------------------------")
+

# --- Model Loading ---
+
# Load Camel-Doc-OCR-062825
+ print("Loading Camel-Doc-OCR-062825...")
MODEL_ID_M = "prithivMLmods/Camel-Doc-OCR-062825"
processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()
+ print("Camel-Doc-OCR-062825 loaded.")

# MinerU2.5-2509
+ print("Loading MinerU2.5-2509...")
MODEL_ID_T = "opendatalab/MinerU2.5-2509-1.2B"
processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
model_t = Qwen2VLForConditionalGeneration.from_pretrained(
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()
+ print("MinerU2.5-2509 loaded.")
+

# Load Video-MTR
+ print("Loading Video-MTR...")
MODEL_ID_S = "Phoebe13/Video-MTR"
processor_s = AutoProcessor.from_pretrained(MODEL_ID_S, trust_remote_code=True)
model_s = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()
+ print("Video-MTR loaded.")

+ # Load moondream3
+ print("Loading moondream3-preview...")
MODEL_ID_MD3 = "moondream/moondream3-preview"
model_md3 = AutoModelForCausalLM.from_pretrained(
    MODEL_ID_MD3,
    torch_dtype=torch.bfloat16,
    device_map={"": "cuda"},
)
+ model_md3.compile()
+ print("moondream3-preview loaded and compiled.")

+ # --- Moondream3 Utility Functions ---
+
+ def create_annotated_image(image, detection_result, object_name="Object"):
+     if not isinstance(detection_result, dict) or "objects" not in detection_result:
+         return image
+
+     original_width, original_height = image.size
+     annotated_image = np.array(image.convert("RGB"))
+
+     bboxes = []
+     labels = []
+
+     for i, obj in enumerate(detection_result["objects"]):
+         x_min = int(obj["x_min"] * original_width)
+         y_min = int(obj["y_min"] * original_height)
+         x_max = int(obj["x_max"] * original_width)
+         y_max = int(obj["y_max"] * original_height)
+
+         x_min = max(0, min(x_min, original_width))
+         y_min = max(0, min(y_min, original_height))
+         x_max = max(0, min(x_max, original_width))
+         y_max = max(0, min(y_max, original_height))
+
+         if x_max > x_min and y_max > y_min:
+             bboxes.append([x_min, y_min, x_max, y_max])
+             labels.append(f"{object_name} {i+1}")
+
+     if not bboxes:
+         return image
+
+     detections = sv.Detections(
+         xyxy=np.array(bboxes, dtype=np.float32),
+         class_id=np.arange(len(bboxes))
+     )
+
+     bounding_box_annotator = sv.BoxAnnotator(
+         thickness=3,
+         color_lookup=sv.ColorLookup.INDEX
    )
+     label_annotator = sv.LabelAnnotator(
+         text_thickness=2,
+         text_scale=0.6,
+         color_lookup=sv.ColorLookup.INDEX
+     )
+
+     annotated_image = bounding_box_annotator.annotate(
+         scene=annotated_image, detections=detections
+     )
+     annotated_image = label_annotator.annotate(
+         scene=annotated_image, detections=detections, labels=labels
+     )
+
+     return Image.fromarray(annotated_image)

+ @spaces.GPU()
+ def process_video_with_tracking(video_path, prompt, detection_interval=3):
+     cap = cv2.VideoCapture(video_path)
+     fps = int(cap.get(cv2.CAP_PROP_FPS))
+     width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

+     byte_tracker = sv.ByteTrack()
+
+     temp_dir = tempfile.mkdtemp()
+     output_path = os.path.join(temp_dir, "tracked_video.mp4")
+     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+     out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
+
+     frame_count = 0
+     detection_count = 0
+
+     try:
+         while True:
+             ret, frame = cap.read()
+             if not ret:
+                 break
+
+             run_detection = (frame_count % detection_interval == 0)
+             detections = sv.Detections.empty()
+
+             if run_detection:
+                 frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                 pil_image = Image.fromarray(frame_rgb)
+
+                 result = model_md3.detect(pil_image, prompt)
+                 detection_count += 1
+
+                 if "objects" in result and result["objects"]:
+                     bboxes = []
+                     confidences = []
+
+                     for obj in result["objects"]:
+                         x_min = max(0.0, min(1.0, obj["x_min"])) * width
+                         y_min = max(0.0, min(1.0, obj["y_min"])) * height
+                         x_max = max(0.0, min(1.0, obj["x_max"])) * width
+                         y_max = max(0.0, min(1.0, obj["y_max"])) * height
+
+                         if x_max > x_min and y_max > y_min:
+                             bboxes.append([x_min, y_min, x_max, y_max])
+                             confidences.append(0.8)
+
+                     if bboxes:
+                         detections = sv.Detections(
+                             xyxy=np.array(bboxes, dtype=np.float32),
+                             confidence=np.array(confidences, dtype=np.float32),
+                             class_id=np.zeros(len(bboxes), dtype=int)
+                         )
+
+             detections = byte_tracker.update_with_detections(detections)

+             if len(detections) > 0:
+                 box_annotator = sv.BoxAnnotator(thickness=3, color_lookup=sv.ColorLookup.TRACK)
+                 label_annotator = sv.LabelAnnotator(text_scale=0.6, text_thickness=2, color_lookup=sv.ColorLookup.TRACK)
+
+                 labels = [f"{prompt} ID: {tracker_id}" for tracker_id in detections.tracker_id]
+
+                 frame = box_annotator.annotate(scene=frame, detections=detections)
+                 frame = label_annotator.annotate(scene=frame, detections=detections, labels=labels)
+
+             out.write(frame)
+             frame_count += 1
+
+             if frame_count % 30 == 0:
+                 progress = (frame_count / total_frames) * 100
+                 print(f"Processing: {progress:.1f}% ({frame_count}/{total_frames}) - Detections: {detection_count}")
+
+     finally:
+         cap.release()
+         out.release()

+     summary = f"""Video processing complete:
+ - Total frames processed: {frame_count}
+ - Detection runs: {detection_count} (every {detection_interval} frames)
+ - Objects tracked: {prompt}
+ - Processing speed: ~{detection_count/frame_count*100:.1f}% detection rate for optimization"""
+
+     return output_path, summary

+ def create_point_annotated_image(image, point_result):
+     if not isinstance(point_result, dict) or "points" not in point_result:
+         return image
+
+     original_width, original_height = image.size
+     annotated_image = np.array(image.convert("RGB"))
+
+     points = []
+     for point in point_result["points"]:
+         x = int(point["x"] * original_width)
+         y = int(point["y"] * original_height)
+         points.append([x, y])
+
+     if points:
+         points_array = np.array(points).reshape(1, -1, 2)
+         key_points = sv.KeyPoints(xy=points_array)
+         vertex_annotator = sv.VertexAnnotator(radius=8, color=sv.Color.RED)
+         annotated_image = vertex_annotator.annotate(
+             scene=annotated_image, key_points=key_points
+         )
+
+     return Image.fromarray(annotated_image)

+ @spaces.GPU()
+ def detect_objects_md3(image, prompt, task_type, max_objects):
+     STANDARD_SIZE = (1024, 1024)
+     if image is None:
+         raise gr.Error("Please upload an image.")
+     image.thumbnail(STANDARD_SIZE)
+
+     t0 = time.perf_counter()
+
+     if task_type == "Object Detection":
+         settings = {"max_objects": max_objects} if max_objects > 0 else {}
+         result = model_md3.detect(image, prompt, settings=settings)
+         annotated_image = create_annotated_image(image, result, prompt)
+     elif task_type == "Point Detection":
+         result = model_md3.point(image, prompt)
+         annotated_image = create_point_annotated_image(image, result)
+     elif task_type == "Caption":
+         result = model_md3.caption(image, length="normal")
+         annotated_image = image
+     else:
+         result = model_md3.query(image=image, question=prompt, reasoning=True)
+         annotated_image = image
+
+     elapsed_ms = (time.perf_counter() - t0) * 1_000
+
+     if isinstance(result, dict):
+         if "objects" in result:
+             output_text = f"Found {len(result['objects'])} objects:\n"
+             for i, obj in enumerate(result['objects'], 1):
+                 output_text += f"\n{i}. Bounding box: ({obj['x_min']:.3f}, {obj['y_min']:.3f}, {obj['x_max']:.3f}, {obj['y_max']:.3f})"
+         elif "points" in result:
+             output_text = f"Found {len(result['points'])} points:\n"
+             for i, point in enumerate(result['points'], 1):
+                 output_text += f"\n{i}. Point: ({point['x']:.3f}, {point['y']:.3f})"
+         elif "caption" in result:
+             output_text = result['caption']
+         elif "answer" in result:
+             output_text = f"Reasoning: {result.get('reasoning', 'N/A')}\n\nAnswer: {result['answer']}"
+         else:
+             output_text = json.dumps(result, indent=2)
+     else:
+         output_text = str(result)
+
+     timing_text = f"Inference time: {elapsed_ms:.0f} ms"
+
+     return annotated_image, output_text, timing_text
+
+ def process_video_md3(video_file, prompt, detection_interval):
+     if video_file is None:
+         return None, "Please upload a video file"
+     output_path, summary = process_video_with_tracking(video_file, prompt, detection_interval)
+     return output_path, summary

315
  @spaces.GPU
316
  def process_document_stream(
317
  model_name: str,
 
324
  repetition_penalty: float
325
  ):
326
  """
327
+ Main generator function for models other than Moondream3.
328
  """
329
  if image is None:
330
+ yield "Please upload an image."
331
  return
332
  if not prompt_input or not prompt_input.strip():
333
+ yield "Please enter a prompt."
334
  return
335
 
336
+ # Select processor and model based on dropdown choice
337
+ if model_name == "Camel-Doc-OCR-062825 (OCR)":
338
+ processor, model = processor_m, model_m
339
+ elif model_name == "MinerU2.5-2509 (General)":
340
+ processor, model = processor_t, model_t
341
+ elif model_name == "Video-MTR (Video/Text)":
342
+ processor, model = processor_s, model_s
 
 
 
 
 
 
 
 
 
 
 
 
343
  else:
344
+ yield "Invalid model selected."
345
  return
346
 
347
+ messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_input}]}]
348
  prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
349
  inputs = processor(text=[prompt_full], images=[image], return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_TOKEN_LENGTH).to(device)
350
 
 
358
  "top_p": top_p,
359
  "top_k": top_k,
360
  "repetition_penalty": repetition_penalty,
361
+ "do_sample": True if temperature > 0 else False
362
  }
363
 
364
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
 
367
  buffer = ""
368
  for new_text in streamer:
369
  buffer += new_text
370
+ # Clean up potential model-specific tokens
371
+ buffer = buffer.replace("<|im_end|>", "").replace("</s>", "")
372
  time.sleep(0.01)
373
+ yield buffer
 
 
 
374
 
375
# --- Gradio UI Definition ---
def create_gradio_interface():
    .main-container { max-width: 1400px; margin: 0 auto; }
    .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
    .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
    """
    with gr.Blocks(theme="bethecloud/storj_theme", css=css) as demo:
+         gr.Markdown("# Multimodal VLM v1.0 🚀")
+         gr.Markdown("Explore the capabilities of various Vision Language Models for tasks like OCR, VQA, Object Detection, and Video Tracking.")
+
+         with gr.Tabs():
+             # --- TAB 1: Document and General VLMs ---
+             with gr.TabItem("📄 Document & General VLM"):
+                 with gr.Row():
+                     with gr.Column(scale=1):
+                         gr.Markdown("### 1. Configure Inputs")
+                         model_choice = gr.Dropdown(
+                             choices=["Camel-Doc-OCR-062825 (OCR)", "MinerU2.5-2509 (General)", "Video-MTR (Video/Text)"],
+                             label="Select Model", value="Camel-Doc-OCR-062825 (OCR)"
+                         )
+                         image_input_doc = gr.Image(label="Upload Image", type="pil", sources=['upload'])
+                         prompt_input_doc = gr.Textbox(label="Query Input", placeholder="e.g., 'Transcribe the text in this document.'")
+
+                         with gr.Accordion("Advanced Generation Settings", open=False):
+                             max_new_tokens = gr.Slider(minimum=256, maximum=4096, value=2048, step=128, label="Max New Tokens")
+                             temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.7)
+                             top_p = gr.Slider(label="Top-p", minimum=0.1, maximum=1.0, step=0.05, value=0.9)
+                             top_k = gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=40)
+                             repetition_penalty = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.1)
+
+                         process_btn = gr.Button("🚀 Process Image", variant="primary", elem_classes=["process-button"])
+                         clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+
+                     with gr.Column(scale=2):
+                         gr.Markdown("### 2. View Output")
+                         output_stream = gr.Textbox(label="Model Output", interactive=False, lines=20, show_copy_button=True)

+                 gr.Examples(
+                     examples=[
+                         ["examples/1.png", "Transcribe this receipt."],
+                         ["examples/2.png", "Extract the table from this document as markdown."],
+                         ["examples/3.png", "What information is presented in this infographic?"],
+                     ],
+                     inputs=[image_input_doc, prompt_input_doc]
+                 )
+
+             # --- TAB 2: Moondream3 Lab ---
+             with gr.TabItem("🌝 Moondream3 Lab"):
+                 with gr.Tabs():
+                     with gr.TabItem("🖼️ Image Processing"):
                        with gr.Row():
+                             with gr.Column(scale=1):
+                                 md3_image_input = gr.Image(label="Upload an image", type="pil", height=400)
+                                 md3_task_type = gr.Radio(
+                                     choices=["Object Detection", "Point Detection", "Caption", "Visual Question Answering"],
+                                     label="Task Type", value="Object Detection"
+                                 )
+                                 md3_prompt_input = gr.Textbox(
+                                     label="Prompt (object to detect/question to ask)",
+                                     placeholder="e.g., 'car', 'person', 'What's in this image?'", value="objects"
+                                 )
+                                 md3_max_objects = gr.Number(
+                                     label="Max Objects (for Object Detection only)",
+                                     value=10, minimum=1, maximum=50, step=1, visible=True
+                                 )
+                                 md3_generate_btn = gr.Button(value="✨ Generate", variant="primary")
+                             with gr.Column(scale=1):
+                                 md3_output_image = gr.Image(type="pil", label="Result", height=400)
+                                 md3_output_textbox = gr.Textbox(label="Model Response", lines=10, show_copy_button=True)
+                                 md3_output_time = gr.Markdown()
+
+                         gr.Examples(
+                             examples=[
+                                 ["https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/candy.JPG", "Object Detection", "candy", 5],
+                                 ["https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/candy.JPG", "Point Detection", "candy", 5],
+                                 ["https://moondream.ai/images/blog/moondream-3-preview/benchmarks.jpg", "Caption", "", 5],
+                                 ["https://moondream.ai/images/blog/moondream-3-preview/benchmarks.jpg", "Visual Question Answering", "how well does moondream 3 perform in chartvqa?", 5],
+                             ],
+                             inputs=[md3_image_input, md3_task_type, md3_prompt_input, md3_max_objects],
+                             label="Click an example to populate inputs"
+                         )

+                     with gr.TabItem("📹 Video Object Tracking"):
+                         with gr.Row():
+                             with gr.Column(scale=1):
+                                 md3_video_input = gr.Video(label="Upload a video file", height=400)
+                                 md3_video_prompt = gr.Textbox(label="Object to track", placeholder="e.g., 'person', 'car', 'ball'", value="person")
+                                 md3_detection_interval = gr.Slider(
+                                     minimum=5, maximum=30, value=15, step=1, label="Detection Interval (frames)",
+                                     info="Run detection every N frames (lower is slower but more accurate)."
+                                 )
+                                 md3_process_video_btn = gr.Button(value="🎥 Process Video", variant="primary")
+                             with gr.Column(scale=1):
+                                 md3_output_video = gr.Video(label="Tracked Video Result", height=400)
+                                 md3_video_summary = gr.Textbox(label="Processing Summary", lines=8, show_copy_button=True)
+                         gr.Examples(
+                             examples=[["https://huggingface.co/datasets/merve/vlm_test_images/resolve/main/IMG_8137.mp4", "snowboarder", 15]],
+                             inputs=[md3_video_input, md3_video_prompt, md3_detection_interval],
+                             label="Click an example to populate inputs"
+                         )
+
+         # --- Event Handlers ---
+
+         # Document Tab
        process_btn.click(
            fn=process_document_stream,
+             inputs=[model_choice, image_input_doc, prompt_input_doc, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
+             outputs=[output_stream]
        )
+         clear_btn.click(lambda: (None, "", ""), outputs=[image_input_doc, prompt_input_doc, output_stream])
+
+         # Moondream3 Tab
+         def update_max_objects_visibility(task):
+             return gr.update(visible=(task == "Object Detection"))
+
+         md3_task_type.change(fn=update_max_objects_visibility, inputs=[md3_task_type], outputs=[md3_max_objects])

+         md3_generate_btn.click(
+             fn=detect_objects_md3,
+             inputs=[md3_image_input, md3_prompt_input, md3_task_type, md3_max_objects],
+             outputs=[md3_output_image, md3_output_textbox, md3_output_time]
        )
+         md3_process_video_btn.click(
+             fn=process_video_md3,
+             inputs=[md3_video_input, md3_video_prompt, md3_detection_interval],
+             outputs=[md3_output_video, md3_video_summary]
        )
+
    return demo

if __name__ == "__main__":
+     # Create some example images if they don't exist
+     if not os.path.exists("examples"):
+         os.makedirs("examples")
+     try:
+         # Dummy image creation for examples to prevent errors if not present
+         Image.new('RGB', (200, 100), color='red').save('examples/1.png')
+         Image.new('RGB', (200, 100), color='green').save('examples/2.png')
+         Image.new('RGB', (200, 100), color='blue').save('examples/3.png')
+     except Exception as e:
+         print(f"Could not create dummy example images: {e}")
+
    demo = create_gradio_interface()
+     demo.queue(max_size=20).launch(share=True, show_error=True)
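
For quick local verification of the Moondream3 detection path this commit adds, the following minimal sketch (not part of the commit) exercises it outside Gradio. It assumes the moondream3-preview remote code exposes detect() with the same settings/normalized-coordinate schema app.py relies on, that it must be loaded with trust_remote_code=True, that the installed supervision version matches the Detections/BoxAnnotator usage above, and that sample.jpg is a placeholder input path.

import numpy as np
import supervision as sv
import torch
from PIL import Image
from transformers import AutoModelForCausalLM

# Load moondream3-preview as in app.py (the remote code supplies detect/point/caption/query).
model = AutoModelForCausalLM.from_pretrained(
    "moondream/moondream3-preview",
    trust_remote_code=True,  # assumption: needed for the custom detection methods
    torch_dtype=torch.bfloat16,
    device_map={"": "cuda"},
)

image = Image.open("sample.jpg")  # placeholder image path
result = model.detect(image, "person", settings={"max_objects": 10})

# detect() returns normalized box coordinates; scale to pixels as create_annotated_image does.
w, h = image.size
boxes = [[o["x_min"] * w, o["y_min"] * h, o["x_max"] * w, o["y_max"] * h]
         for o in result.get("objects", [])]

if boxes:
    detections = sv.Detections(xyxy=np.array(boxes, dtype=np.float32),
                               class_id=np.arange(len(boxes)))
    annotated = sv.BoxAnnotator(thickness=3).annotate(
        scene=np.array(image.convert("RGB")), detections=detections)
    Image.fromarray(annotated).save("annotated.jpg")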