import json
import time

import gradio as gr
import numpy as np
import supervision as sv
from gradio.themes.ocean import Ocean
from PIL import Image
from spaces import GPU
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    Qwen3VLForConditionalGeneration,
)
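
# Load both models once at startup: Qwen3-VL through transformers,
# Moondream 3 through its remote code (trust_remote_code=True).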
model_qwen_id = "Qwen/Qwen3-VL-4B-Instruct"
model_moondream_id = "moondream/moondream3-preview"
model_qwen = Qwen3VLForConditionalGeneration.from_pretrained(
model_qwen_id, torch_dtype="auto", device_map="auto",
)
model_moondream = AutoModelForCausalLM.from_pretrained(
model_moondream_id,
trust_remote_code=True,
device_map={"": "cuda"},
)

def extract_model_short_name(model_id):
    return model_id.split("/")[-1].replace("-", " ").replace("_", " ")


model_qwen_name = extract_model_short_name(model_qwen_id)
model_moondream_name = extract_model_short_name(model_moondream_id)

processor_qwen = AutoProcessor.from_pretrained(model_qwen_id)
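

# Draw Qwen2.5-VL-style detections: boxes are parsed from the raw ```json
# response by supervision, and any "point_2d" keypoints are rescaled from the
# model's (width, height) space to the original image size. The image is
# returned unannotated if the response contains no parsable ```json block.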
def create_annotated_image(image, json_data, height, width):
try:
parsed_json_data = json_data.split("```json")[1].split("```")[0]
bbox_data = json.loads(parsed_json_data)
except Exception:
return image
original_width, original_height = image.size
x_scale = original_width / width
y_scale = original_height / height
points = []
point_labels = []
for item in bbox_data:
label = item.get("label", "")
if "point_2d" in item:
x, y = item["point_2d"]
scaled_x = int(x * x_scale)
scaled_y = int(y * y_scale)
points.append([scaled_x, scaled_y])
point_labels.append(label)
annotated_image = np.array(image.convert("RGB"))
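    # supervision parses the boxes straight from the raw response text;
    # the keypoints collected above are drawn separately below.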
    detections = sv.Detections.from_vlm(
        vlm=sv.VLM.QWEN_2_5_VL,
        result=json_data,
        input_wh=(original_width, original_height),
        resolution_wh=(original_width, original_height),
    )
bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
annotated_image = bounding_box_annotator.annotate(
scene=annotated_image, detections=detections
)
annotated_image = label_annotator.annotate(
scene=annotated_image, detections=detections
)
if points:
points_array = np.array(points).reshape(1, -1, 2)
key_points = sv.KeyPoints(xy=points_array)
vertex_annotator = sv.VertexAnnotator(radius=5, color=sv.Color.BLUE)
# vertex_label_annotator = sv.VertexLabelAnnotator(text_scale=0.5, border_radius=2)
annotated_image = vertex_annotator.annotate(
scene=annotated_image, key_points=key_points
)
# annotated_image = vertex_label_annotator.annotate(
# scene=annotated_image,
# key_points=key_points,
# labels=point_labels
# )
return Image.fromarray(annotated_image)
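

# Draw Moondream responses: dicts carrying normalized [0, 1] coordinates under
# "points", "reasoning" -> "grounding", or "objects".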
def create_annotated_image_normalized(image, json_data, label="object"):
if not isinstance(json_data, dict):
return image
original_width, original_height = image.size
annotated_image = np.array(image.convert("RGB"))
points = []
if "points" in json_data:
for point in json_data.get("points", []):
x = int(point["x"] * original_width)
y = int(point["y"] * original_height)
points.append([x, y])
if "reasoning" in json_data:
for grounding in json_data["reasoning"].get("grounding", []):
for x_norm, y_norm in grounding.get("points", []):
x = int(x_norm * original_width)
y = int(y_norm * original_height)
points.append([x, y])
if points:
points_array = np.array(points).reshape(1, -1, 2)
key_points = sv.KeyPoints(xy=points_array)
vertex_annotator = sv.VertexAnnotator(radius=5, color=sv.Color.RED)
annotated_image = vertex_annotator.annotate(
scene=annotated_image, key_points=key_points
)
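    # detect() responses carry normalized boxes under "objects"; supervision
    # parses them directly.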
if "objects" in json_data:
        detections = sv.Detections.from_vlm(
            vlm=sv.VLM.MOONDREAM,
            result=json_data,
            resolution_wh=(original_width, original_height),
        )
bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
labels = [label for _ in detections.xyxy]
annotated_image = bounding_box_annotator.annotate(
scene=annotated_image, detections=detections
)
annotated_image = label_annotator.annotate(
scene=annotated_image, detections=detections, labels=labels
)
return Image.fromarray(annotated_image)
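

# Pull the JSON list of detections out of the raw Qwen3-VL text, tolerating a
# ```json fence and generations truncated by max_new_tokens.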
def parse_qwen3_json(json_output):
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        if line.strip() == "```json":
            # Keep only the contents of the fenced block.
            json_output = "\n".join(lines[i + 1 :])
            json_output = json_output.split("```")[0]
            break
    try:
        boxes = json.loads(json_output)
    except json.JSONDecodeError:
        # The generation may have been cut off mid-list; salvage up to the
        # last complete object and close the list.
        end_idx = json_output.rfind('"}')
        if end_idx == -1:
            raise
        boxes = json.loads(json_output[: end_idx + len('"}')] + "]")
if not isinstance(boxes, list):
boxes = [boxes]
return boxes
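

# Draw Qwen3-VL detections: "bbox_2d" boxes on a 0-1000 grid are clamped and
# rescaled to the original image size.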
def create_annotated_image_qwen3(image, json_output):
try:
boxes = parse_qwen3_json(json_output)
except Exception as e:
print(f"Error parsing JSON: {e}")
return image
if not boxes:
return image
original_width, original_height = image.size
annotated_image = np.array(image.convert("RGB"))
xyxy = []
labels = []
for box in boxes:
if "bbox_2d" in box and "label" in box:
x1, y1, x2, y2 = box["bbox_2d"]
scale = 1000
x1 = max(0, min(scale, x1)) / scale * original_width
y1 = max(0, min(scale, y1)) / scale * original_height
x2 = max(0, min(scale, x2)) / scale * original_width
y2 = max(0, min(scale, y2)) / scale * original_height
# Ensure x1 <= x2 and y1 <= y2
            if x1 > x2:
                x1, x2 = x2, x1
            if y1 > y2:
                y1, y2 = y2, y1
xyxy.append([int(x1), int(y1), int(x2), int(y2)])
labels.append(box["label"])
if not xyxy:
return image
detections = sv.Detections(
xyxy=np.array(xyxy),
class_id=np.arange(len(xyxy))
)
bounding_box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX)
label_annotator = sv.LabelAnnotator(color_lookup=sv.ColorLookup.INDEX)
annotated_image = bounding_box_annotator.annotate(
scene=annotated_image, detections=detections
)
annotated_image = label_annotator.annotate(
scene=annotated_image, detections=detections, labels=labels
)
return Image.fromarray(annotated_image)
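

# @GPU requests a ZeroGPU slot for the duration of the call on Spaces.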
@GPU
def detect_qwen(image, prompt):
messages = [
{
"role": "user",
"content": [
{"type": "image", "image": image},
{"type": "text", "text": prompt},
],
}
]
t0 = time.perf_counter()
inputs = processor_qwen.apply_chat_template(
messages,
tokenize=True,
add_generation_prompt=True,
return_dict=True,
return_tensors="pt"
).to(model_qwen.device)
generated_ids = model_qwen.generate(**inputs, max_new_tokens=1024)
generated_ids_trimmed = [
out_ids[len(in_ids) :]
for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor_qwen.batch_decode(
generated_ids_trimmed,
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)[0]
elapsed_ms = (time.perf_counter() - t0) * 1_000
annotated_image = create_annotated_image_qwen3(image, output_text)
time_taken = f"**Inference time ({model_qwen_name}):** {elapsed_ms:.0f} ms"
return annotated_image, output_text, time_taken
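

# Moondream exposes task-specific APIs: detect() for boxes, point() for
# keypoints, and query() with reasoning for everything else.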
@GPU
def detect_moondream(image, prompt, category_input):
t0 = time.perf_counter()
if category_input in ["Object Detection", "Visual Grounding + Object Detection"]:
output_text = model_moondream.detect(image=image, object=prompt)
elif category_input == "Visual Grounding + Keypoint Detection":
output_text = model_moondream.point(image=image, object=prompt)
else:
output_text = model_moondream.query(
image=image, question=prompt, reasoning=True
)
elapsed_ms = (time.perf_counter() - t0) * 1_000
annotated_image = create_annotated_image_normalized(
image=image, json_data=output_text, label="object"
)
time_taken = f"**Inference time ({model_moondream_name}):** {elapsed_ms:.0f} ms"
return annotated_image, output_text, time_taken
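

# Entry point for the Generate button: run both models on the same downscaled
# image and return the six outputs in UI order.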
def detect(image, prompt_model_1, prompt_model_2, category_input):
    if image is None:
        raise gr.Error("Please upload an image first.")
    STANDARD_SIZE = (1024, 1024)
    image.thumbnail(STANDARD_SIZE)  # Downscale in place, preserving aspect ratio.
annotated_image_model_1, output_text_model_1, timing_1 = detect_qwen(
image, prompt_model_1
)
annotated_image_model_2, output_text_model_2, timing_2 = detect_moondream(
image, prompt_model_2, category_input
)
return (
annotated_image_model_1,
output_text_model_1,
timing_1,
annotated_image_model_2,
output_text_model_2,
timing_2,
)
css_hide_share = """
button#gradio-share-link-button-0 {
display: none !important;
}
"""
with gr.Blocks(theme=Ocean(), css=css_hide_share) as demo:
gr.Markdown("# 👓 Object Understanding with Vision Language Models")
gr.Markdown(
"### Explore object detection, visual grounding, keypoint detection, and/or object counting through natural language prompts."
)
gr.Markdown("""
*Powered by [Qwen3-VL 4B](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct) and [Moondream 3 Preview](https://huggingface.co/moondream/moondream3-preview). Inspired by the tutorial [Object Detection and Visual Grounding with Qwen 2.5](https://pyimagesearch.com/2025/06/09/object-detection-and-visual-grounding-with-qwen-2-5/) on PyImageSearch.*
    *Moondream 3 uses the [moondream-preview](https://huggingface.co/vikhyatk/moondream2/blob/main/moondream.py) interface, calling `detect` for categories that include "Object Detection", `point` for those with "Keypoint Detection", and reasoning-based `query` for everything else.*
""")
with gr.Row():
with gr.Column(scale=2):
image_input = gr.Image(label="Upload an image", type="pil", height=400)
prompt_input_model_1 = gr.Textbox(
label=f"Enter your prompt for {model_qwen_name}",
placeholder="e.g., Detect all red cars in the image",
)
prompt_input_model_2 = gr.Textbox(
label=f"Enter your prompt for {model_moondream_name}",
placeholder="e.g., Detect all blue cars in the image",
)
categories = [
"Object Detection",
"Object Counting",
"Visual Grounding + Keypoint Detection",
"Visual Grounding + Object Detection",
"General query",
]
category_input = gr.Dropdown(
choices=categories, label="Category", interactive=True
)
generate_btn = gr.Button(value="Generate")
with gr.Column(scale=1):
output_image_model_1 = gr.Image(
type="pil", label=f"Annotated image for {model_qwen_name}", height=400
)
output_textbox_model_1 = gr.Textbox(
label=f"Model response for {model_qwen_name}", lines=10
)
output_time_model_1 = gr.Markdown()
with gr.Column(scale=1):
output_image_model_2 = gr.Image(
type="pil",
label=f"Annotated image for {model_moondream_name}",
height=400,
)
output_textbox_model_2 = gr.Textbox(
label=f"Model response for {model_moondream_name}", lines=10
)
output_time_model_2 = gr.Markdown()
gr.Markdown("### Examples")
example_prompts = [
[
"examples/example_1.jpg",
"locate every instance in the image. Report bbox coordinates in JSON format.",
"objects",
"Object Detection",
],
[
"examples/example_2.JPG",
'locate every instance that belongs to the following categories: "candy, hand". Report bbox coordinates in JSON format.',
"candies",
"Object Detection",
],
[
"examples/example_1.jpg",
"Count the number of red cars in the image.",
"Count the number of red cars in the image.",
"Object Counting",
],
[
"examples/example_2.JPG",
"Count the number of blue candies in the image.",
"Count the number of blue candies in the image.",
"Object Counting",
],
[
"examples/example_1.jpg",
            'locate every instance that belongs to the following categories: "red car". Report bbox coordinates in JSON format.',
"red cars",
"Visual Grounding + Keypoint Detection",
],
[
"examples/example_2.JPG",
"Identify the blue candies in this image, detect their key points and return their positions in the form of points.",
"blue candies",
"Visual Grounding + Keypoint Detection",
],
[
"examples/example_1.jpg",
            'locate every instance that belongs to the following categories: "leading red car". Report bbox coordinates in JSON format.',
"leading red car",
"Visual Grounding + Object Detection",
],
[
"examples/example_2.JPG",
'locate every instance that belongs to the following categories: "blue candy located at the top of the group". Report bbox coordinates in JSON format.',
"blue candy located at the top of the group",
"Visual Grounding + Object Detection",
],
]
gr.Examples(
examples=example_prompts,
inputs=[
image_input,
prompt_input_model_1,
prompt_input_model_2,
category_input,
],
label="Click an example to populate the input",
)
generate_btn.click(
fn=detect,
inputs=[
image_input,
prompt_input_model_1,
prompt_input_model_2,
category_input,
],
outputs=[
output_image_model_1,
output_textbox_model_1,
output_time_model_1,
output_image_model_2,
output_textbox_model_2,
output_time_model_2,
],
)
if __name__ == "__main__":
demo.launch()