Spaces:

sergiopaniego
/

vlm_object_understanding

Running on Zero

File size: 15,439 Bytes

1b98b3b
bba3f6f
1b98b3b
 
b83d741
 
 
1b98b3b
b83d741
 
 
f793138
b83d741
 
1b98b3b
b83d741
1b98b3b
f793138
 
36f3f37
f793138
 
36f3f37
 
 
 
b83d741
1b98b3b
 
b83d741
36f3f37
 
 
b83d741
f793138
 
36f3f37
 
f793138
b83d741
1b98b3b
 
 
b83d741
 
1b98b3b
 
 
 
 
 
 
cfcfd51
 
1b98b3b
 
 
 
 
 
 
cfcfd51
 
1b98b3b
b83d741
 
 
 
 
 
 
 
cfcfd51
 
 
 
b83d741
cfcfd51
 
b83d741
cfcfd51
 
 
 
 
 
b83d741
cfcfd51
 
b83d741
cfcfd51
b83d741
cfcfd51
 
 
 
 
1b98b3b
cfcfd51
 
b83d741
cfcfd51
36f3f37
 
 
 
cfcfd51
 
 
 
 
 
 
 
b83d741
36f3f37
 
 
 
 
b83d741
cfcfd51
 
 
 
 
b83d741
 
 
cfcfd51
 
b83d741
 
 
 
cfcfd51
 
b83d741
cfcfd51
36f3f37
cfcfd51
b83d741
cfcfd51
 
b83d741
cfcfd51
 
 
36f3f37
f793138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1b98b3b
36f3f37
1b98b3b
 
 
 
 
 
 
 
 
 
bba3f6f
f793138
 
 
 
 
 
36f3f37
1b98b3b
36f3f37
1b98b3b
b83d741
 
1b98b3b
36f3f37
b83d741
 
 
1b98b3b
bba3f6f
1b98b3b
f793138
1b98b3b
bba3f6f
f793138
bba3f6f
1b98b3b
36f3f37
 
 
bba3f6f
36f3f37
 
 
 
 
b83d741
 
 
bba3f6f
36f3f37
b83d741
 
 
36f3f37
bba3f6f
 
 
b83d741
36f3f37
 
 
 
b83d741
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36f3f37
1b98b3b
 
 
 
 
 
8f86518
36f3f37
b83d741
 
 
1b98b3b
f793138
 
1b98b3b
 
b553066
36f3f37
b553066
36f3f37
 
b83d741
36f3f37
 
 
 
b83d741
36f3f37
 
 
 
 
 
 
b83d741
36f3f37
 
 
b83d741
36f3f37
b553066
 
 
b83d741
 
 
 
 
 
bba3f6f
b83d741
36f3f37
b83d741
 
 
 
 
 
 
 
bba3f6f
1b98b3b
b553066
 
b83d741
 
f793138
b83d741
 
 
 
 
f793138
b83d741
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f793138
b83d741
 
 
 
 
 
 
 
 
 
 
f793138
b83d741
 
 
 
 
f793138
b83d741
 
 
b553066
 
f793138
 
 
 
 
 
 
 
b553066
1b98b3b
bba3f6f
 
b83d741
 
 
 
 
 
bba3f6f
b83d741
 
 
 
 
 
 
bba3f6f
b83d741
1b98b3b
f793138

import json
import time

import gradio as gr
import numpy as np
from gradio.themes.ocean import Ocean
from PIL import Image
from qwen_vl_utils import process_vision_info
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    Qwen3VLForConditionalGeneration,
)

from spaces import GPU
import supervision as sv

# Hugging Face model identifiers for the two models compared by this app.
model_qwen_id = "Qwen/Qwen3-VL-4B-Instruct"
model_moondream_id = "moondream/moondream3-preview"

# Both models are instantiated at import time, so importing this module
# triggers the (potentially slow) weight loading.
model_qwen = Qwen3VLForConditionalGeneration.from_pretrained(
    model_qwen_id, torch_dtype="auto", device_map="auto",
)
# trust_remote_code=True lets the checkpoint's own Python code run on load.
# NOTE(review): the {"": "cuda"} device map presumably places the whole model
# on the CUDA device - confirm against the transformers documentation.
model_moondream = AutoModelForCausalLM.from_pretrained(
    model_moondream_id,
    trust_remote_code=True,
    device_map={"": "cuda"},
)


def extract_model_short_name(model_id):
    """Return a display name for a model id.

    Keeps only the text after the last "/" and turns every "-" and "_" in it
    into a space.
    """
    _, _, tail = model_id.rpartition("/")
    return tail.translate(str.maketrans("-_", "  "))


# Human-readable model names, reused in UI labels and in the timing messages.
model_qwen_name = extract_model_short_name(model_qwen_id)  
model_moondream_name = extract_model_short_name(model_moondream_id)  


# Processor for the Qwen model; detect_qwen uses it to build the chat-template
# inputs and to decode the generated token ids.
processor_qwen = AutoProcessor.from_pretrained(model_qwen_id)


def create_annotated_image(image, json_data, height, width):
    """Draw the boxes and points described by a fenced JSON model reply.

    Args:
        image: PIL image to annotate. It is not modified.
        json_data: Raw model reply. Annotations are read from the first
            ```json fenced block, which must contain a non-empty JSON list.
        height: Height of the coordinate space used by ``point_2d`` entries.
        width: Width of the coordinate space used by ``point_2d`` entries.

    Returns:
        A new annotated PIL image, or ``image`` itself when the reply holds no
        parseable fenced JSON, or when the decoded value is not a non-empty
        list (there is nothing to draw in that case).
    """
    try:
        fenced_json = json_data.split("```json")[1].split("```")[0]
        bbox_data = json.loads(fenced_json)
    except Exception:
        # Deliberate best effort: any reply that cannot be parsed simply
        # leaves the image unannotated.
        return image

    # An empty list previously reached the final return without ever creating
    # the annotated array, and a non-list value failed inside the loop.
    if not isinstance(bbox_data, list) or not bbox_data:
        return image

    original_width, original_height = image.size
    x_scale = original_width / width
    y_scale = original_height / height

    # Rescale every reported point from the (width, height) space to pixels.
    points = []
    for item in bbox_data:
        if isinstance(item, dict) and "point_2d" in item:
            x, y = item["point_2d"]
            points.append([int(x * x_scale), int(y * y_scale)])

    annotated_image = np.array(image.convert("RGB"))

    # Box parsing and drawing do not depend on the individual item, so they
    # run once here instead of once per list entry.
    detections = sv.Detections.from_vlm(
        vlm=sv.VLM.QWEN_2_5_VL,
        result=json_data,
        input_wh=(original_width, original_height),
        resolution_wh=(original_width, original_height),
    )
    bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
    label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)

    annotated_image = bounding_box_annotator.annotate(
        scene=annotated_image, detections=detections
    )
    annotated_image = label_annotator.annotate(
        scene=annotated_image, detections=detections
    )

    if points:
        # KeyPoints receives a single group holding all points: (1, N, 2).
        points_array = np.array(points).reshape(1, -1, 2)
        key_points = sv.KeyPoints(xy=points_array)
        vertex_annotator = sv.VertexAnnotator(radius=5, color=sv.Color.BLUE)
        annotated_image = vertex_annotator.annotate(
            scene=annotated_image, key_points=key_points
        )

    return Image.fromarray(annotated_image)


def create_annotated_image_normalized(image, json_data, label="object"):
    """Annotate ``image`` from a dict whose coordinates are image fractions.

    Points are taken from ``json_data["points"]`` (dicts with "x" / "y") and
    from the ``points`` pairs of every entry in
    ``json_data["reasoning"]["grounding"]``; each coordinate is multiplied by
    the image width or height and drawn as a red vertex. When an "objects"
    key is present, boxes are built with ``sv.Detections.from_vlm`` and every
    box is captioned with ``label``.

    Returns ``image`` unchanged when ``json_data`` is not a dict, otherwise a
    new PIL image.
    """
    if not isinstance(json_data, dict):
        return image

    img_w, img_h = image.size
    canvas = np.array(image.convert("RGB"))

    # Points reported directly at the top level of the payload.
    pixel_points = [
        [int(pt["x"] * img_w), int(pt["y"] * img_h)]
        for pt in json_data.get("points", [])
    ]

    # Points attached to the reasoning groundings, given as (x, y) pairs.
    if "reasoning" in json_data:
        groundings = json_data["reasoning"].get("grounding", [])
        pixel_points.extend(
            [int(nx * img_w), int(ny * img_h)]
            for grounding in groundings
            for nx, ny in grounding.get("points", [])
        )

    if pixel_points:
        keypoints = sv.KeyPoints(xy=np.array(pixel_points).reshape(1, -1, 2))
        dot_painter = sv.VertexAnnotator(radius=5, color=sv.Color.RED)
        canvas = dot_painter.annotate(scene=canvas, key_points=keypoints)

    if "objects" in json_data:
        detections = sv.Detections.from_vlm(
            sv.VLM.MOONDREAM,
            json_data,
            resolution_wh=(img_w, img_h),
        )

        box_painter = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
        caption_painter = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)

        # The same caption is used for every detected box.
        captions = [label] * len(detections.xyxy)

        canvas = box_painter.annotate(scene=canvas, detections=detections)
        canvas = caption_painter.annotate(
            scene=canvas, detections=detections, labels=captions
        )

    return Image.fromarray(canvas)


def parse_qwen3_json(json_output):
    """Extract the list of annotation objects from a Qwen3-VL text reply.

    The reply may be bare JSON or JSON wrapped in a ```json fenced block.
    When decoding fails, the text is cut right after its last '"}' and
    closed with "]", then decoded once more; this salvages a list that was
    cut off in the middle of an object.

    Args:
        json_output: Raw text produced by the model.

    Returns:
        A list of decoded JSON values. A single non-list value is wrapped in
        a one-element list.

    Raises:
        json.JSONDecodeError: If the salvage attempt fails as well.
    """
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        # Compare the stripped line so that an indented fence, or one with
        # trailing whitespace, is still recognised.
        if line.strip() == "```json":
            json_output = "\n".join(lines[i + 1:]).split("```")[0]
            break

    try:
        boxes = json.loads(json_output)
    except json.JSONDecodeError:
        # If no '"}' exists, rfind() returns -1 and only the first character
        # is kept; the retry then succeeds only when that character is "["
        # (yielding an empty list) and raises otherwise.
        end_idx = json_output.rfind('"}') + len('"}')
        boxes = json.loads(json_output[:end_idx] + "]")

    return boxes if isinstance(boxes, list) else [boxes]


def create_annotated_image_qwen3(image, json_output):
    """Draw the labelled boxes from a Qwen3-VL reply onto ``image``.

    Each usable entry must be a dict with a "label" and a "bbox_2d" of four
    numbers ``[x1, y1, x2, y2]``. The numbers are treated as positions on a
    0-1000 scale: they are clamped to that range and rescaled to the pixel
    size of ``image``. Entries that do not fit this shape are skipped instead
    of aborting the whole annotation.

    Args:
        image: PIL image to annotate. It is not modified.
        json_output: Raw text produced by the model.

    Returns:
        A new annotated PIL image, or ``image`` itself when the reply cannot
        be parsed or contains no usable box.
    """
    try:
        boxes = parse_qwen3_json(json_output)
    except Exception as e:
        print(f"Error parsing JSON: {e}")
        return image

    if not boxes:
        return image

    original_width, original_height = image.size
    scale = 1000

    xyxy = []
    labels = []

    for box in boxes:
        if not isinstance(box, dict) or "bbox_2d" not in box or "label" not in box:
            continue
        try:
            # Clamp to [0, scale] and convert to a fraction of the image.
            # Unpacking fails with ValueError on a wrong element count and
            # the comparison fails with TypeError on non-numeric values.
            x1, y1, x2, y2 = (
                max(0, min(scale, value)) / scale for value in box["bbox_2d"]
            )
        except (TypeError, ValueError):
            continue

        # Order the corners so that x1 <= x2 and y1 <= y2.
        left, right = sorted((x1 * original_width, x2 * original_width))
        top, bottom = sorted((y1 * original_height, y2 * original_height))
        xyxy.append([int(left), int(top), int(right), int(bottom)])
        labels.append(box["label"])

    if not xyxy:
        return image

    annotated_image = np.array(image.convert("RGB"))

    # One class id per box so that INDEX colour lookup has distinct values.
    detections = sv.Detections(
        xyxy=np.array(xyxy),
        class_id=np.arange(len(xyxy)),
    )

    bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
    label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)

    annotated_image = bounding_box_annotator.annotate(
        scene=annotated_image, detections=detections
    )
    annotated_image = label_annotator.annotate(
        scene=annotated_image, detections=detections, labels=labels
    )

    return Image.fromarray(annotated_image)


@GPU
def detect_qwen(image, prompt):
    """Query the Qwen model with one image and prompt, then annotate the image.

    Returns a 3-tuple: the annotated image, the decoded model text, and a
    Markdown string reporting the elapsed time in milliseconds. The timer
    covers input preparation, generation and decoding, but not the drawing.
    """
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    started = time.perf_counter()
    model_inputs = processor_qwen.apply_chat_template(
        conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )
    model_inputs = model_inputs.to(model_qwen.device)

    sequences = model_qwen.generate(**model_inputs, max_new_tokens=1024)

    # Drop the first len(prompt ids) tokens of every sequence so that only
    # the tokens after the prompt are decoded.
    completions = [
        sequence[len(prompt_ids):]
        for prompt_ids, sequence in zip(model_inputs.input_ids, sequences)
    ]
    decoded = processor_qwen.batch_decode(
        completions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    output_text = decoded[0]
    elapsed_ms = (time.perf_counter() - started) * 1_000

    annotated = create_annotated_image_qwen3(image, output_text)
    timing_note = f"**Inference time ({model_qwen_name}):** {elapsed_ms:.0f} ms"

    return annotated, output_text, timing_note


@GPU
def detect_moondream(image, prompt, category_input):
    """Query the Moondream model using the method that matches the category.

    "Visual Grounding + Keypoint Detection" uses ``point``; the two
    "Object Detection" categories use ``detect``; every other category uses
    ``query`` with reasoning enabled. Only the model call is timed.

    Returns a 3-tuple: the annotated image, the raw model result, and a
    Markdown string reporting the elapsed time in milliseconds.
    """
    detection_categories = (
        "Object Detection",
        "Visual Grounding + Object Detection",
    )

    started = time.perf_counter()
    if category_input == "Visual Grounding + Keypoint Detection":
        result = model_moondream.point(image=image, object=prompt)
    elif category_input in detection_categories:
        result = model_moondream.detect(image=image, object=prompt)
    else:
        result = model_moondream.query(
            image=image, question=prompt, reasoning=True
        )
    elapsed_ms = (time.perf_counter() - started) * 1_000

    annotated = create_annotated_image_normalized(
        image=image, json_data=result, label="object"
    )
    timing_note = (
        f"**Inference time ({model_moondream_name}):** {elapsed_ms:.0f} ms"
    )
    return annotated, result, timing_note


def detect(image, prompt_model_1, prompt_model_2, category_input):
    """Run both models on the same downsized image and collect their outputs.

    Args:
        image: PIL image supplied by the UI, or ``None`` when nothing was
            uploaded.
        prompt_model_1: Prompt passed to ``detect_qwen``.
        prompt_model_2: Prompt passed to ``detect_moondream``.
        category_input: Selected task category, forwarded to
            ``detect_moondream``.

    Returns:
        A 6-tuple: annotated image, model text and timing message for the
        first model, followed by the same three values for the second model.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Without this check the request failed with an AttributeError on
        # None; gr.Error carries a readable message instead.
        raise gr.Error("Please upload an image before clicking Generate.")

    STANDARD_SIZE = (1024, 1024)
    # thumbnail() resizes the object it is called on, so work on a copy and
    # leave the caller's image untouched.
    image = image.copy()
    image.thumbnail(STANDARD_SIZE)

    annotated_image_model_1, output_text_model_1, timing_1 = detect_qwen(
        image, prompt_model_1
    )
    annotated_image_model_2, output_text_model_2, timing_2 = detect_moondream(
        image, prompt_model_2, category_input
    )

    return (
        annotated_image_model_1,
        output_text_model_1,
        timing_1,
        annotated_image_model_2,
        output_text_model_2,
        timing_2,
    )


# CSS handed to gr.Blocks below; it hides the element whose id is
# gradio-share-link-button-0.
css_hide_share = """
button#gradio-share-link-button-0 {
    display: none !important;
}
"""

# Build the Gradio interface. Components are created in the order they are
# declared inside the layout context managers.
with gr.Blocks(theme=Ocean(), css=css_hide_share) as demo:
    # Page title, subtitle and credits.
    gr.Markdown("# 👓 Object Understanding with Vision Language Models")
    gr.Markdown(
        "### Explore object detection, visual grounding, keypoint detection, and/or object counting through natural language prompts."
    )
    gr.Markdown("""
    *Powered by [Qwen3-VL 4B](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct) and [Moondream 3 Preview](https://huggingface.co/moondream/moondream3-preview). Inspired by the tutorial [Object Detection and Visual Grounding with Qwen 2.5](https://pyimagesearch.com/2025/06/09/object-detection-and-visual-grounding-with-qwen-2-5/) on PyImageSearch.*
    *Moondream 3 uses the [moondream-preview](https://huggingface.co/vikhyatk/moondream2/blob/main/moondream.py), selecting `detect` for categories with "Object Detection" `point` for the ones with "Keypoint Detection", and reasoning-based querying for all others.*
    """)

    with gr.Row():
        # Input column: image, one prompt per model, task category, button.
        with gr.Column(scale=2):
            image_input = gr.Image(label="Upload an image", type="pil", height=400)
            prompt_input_model_1 = gr.Textbox(
                label=f"Enter your prompt for {model_qwen_name}",
                placeholder="e.g., Detect all red cars in the image",
            )

            prompt_input_model_2 = gr.Textbox(
                label=f"Enter your prompt for {model_moondream_name}",
                placeholder="e.g., Detect all blue cars in the image",
            )

            # These strings are compared verbatim in detect_moondream to pick
            # the Moondream method, so they must stay in sync with it.
            categories = [
                "Object Detection",
                "Object Counting",
                "Visual Grounding + Keypoint Detection",
                "Visual Grounding + Object Detection",
                "General query",
            ]

            category_input = gr.Dropdown(
                choices=categories, label="Category", interactive=True
            )
            generate_btn = gr.Button(value="Generate")

        # Output column for the Qwen model.
        with gr.Column(scale=1):
            output_image_model_1 = gr.Image(
                type="pil", label=f"Annotated image for {model_qwen_name}", height=400
            )
            output_textbox_model_1 = gr.Textbox(
                label=f"Model response for {model_qwen_name}", lines=10
            )
            output_time_model_1 = gr.Markdown()

        # Output column for the Moondream model.
        with gr.Column(scale=1):
            output_image_model_2 = gr.Image(
                type="pil",
                label=f"Annotated image for {model_moondream_name}",
                height=400,
            )
            output_textbox_model_2 = gr.Textbox(
                label=f"Model response for {model_moondream_name}", lines=10
            )
            output_time_model_2 = gr.Markdown()

    gr.Markdown("### Examples")
    # Each example row is: image path, prompt for the first model, prompt for
    # the second model, category - the same order as the `inputs` list given
    # to gr.Examples below.
    example_prompts = [
        [
            "examples/example_1.jpg",
            "locate every instance in the image. Report bbox coordinates in JSON format.",
            "objects",
            "Object Detection",
        ],
        [
            "examples/example_2.JPG",
            'locate every instance that belongs to the following categories: "candy, hand". Report bbox coordinates in JSON format.',
            "candies",
            "Object Detection",
        ],
        [
            "examples/example_1.jpg",
            "Count the number of red cars in the image.",
            "Count the number of red cars in the image.",
            "Object Counting",
        ],
        [
            "examples/example_2.JPG",
            "Count the number of blue candies in the image.",
            "Count the number of blue candies in the image.",
            "Object Counting",
        ],
        [
            "examples/example_1.jpg",
            'locate every instance that belongs to the following categories: "red car". Report bbox coordinates in JSON format..',
            "red cars",
            "Visual Grounding + Keypoint Detection",
        ],
        [
            "examples/example_2.JPG",
            "Identify the blue candies in this image, detect their key points and return their positions in the form of points.",
            "blue candies",
            "Visual Grounding + Keypoint Detection",
        ],
        [
            "examples/example_1.jpg",
            'locate every instance that belongs to the following categories: "leading red car". Report bbox coordinates in JSON format..',
            "leading red car",
            "Visual Grounding + Object Detection",
        ],
        [
            "examples/example_2.JPG",
            'locate every instance that belongs to the following categories: "blue candy located at the top of the group". Report bbox coordinates in JSON format.',
            "blue candy located at the top of the group",
            "Visual Grounding + Object Detection",
        ],
    ]
    gr.Examples(
         examples=example_prompts,
         inputs=[
             image_input,
             prompt_input_model_1,
             prompt_input_model_2,
             category_input,
         ],
         label="Click an example to populate the input",
    )

    # The outputs list follows the order of the 6-tuple returned by detect().
    generate_btn.click(
        fn=detect,
        inputs=[
            image_input,
            prompt_input_model_1,
            prompt_input_model_2,
            category_input,
        ],
        outputs=[
            output_image_model_1,
            output_textbox_model_1,
            output_time_model_1,
            output_image_model_2,
            output_textbox_model_2,
            output_time_model_2,
        ],
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()