Spaces:

techdrizzdev
/

ui_model_eval

Runtime error

App Files Files Community

ui_model_eval / app.py

techdrizzdev

Create app.py

18c16bc verified 10 months ago

raw

history blame

8.56 kB

	import base64
	import json
	from datetime import datetime
	import torch
	import spaces
	from PIL import Image, ImageDraw
	from qwen_vl_utils import process_vision_info
	from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForCausalLM, AutoTokenizer
	from PIL import Image
	import ast
	import os
	from datetime import datetime
	import numpy as np
	from huggingface_hub import hf_hub_download, list_repo_files
	import gradio as gr
	import time

	# Instruction sent as the first text item of every ShowUI request (see infer_showui).
	_SYSTEM = "Based on the screenshot of the page, I give a text description and you give its corresponding location. The coordinate represents a clickable location [x, y] for an element, which is a relative coordinate on the screenshot, scaled from 0 to 1."
	# Pixel budget handed to the processor and to each image message as min_pixels / max_pixels.
	MIN_PIXELS = 256 * 28 * 28
	MAX_PIXELS = 1344 * 28 * 28

	# Model repository to mirror and the local folder it is mirrored into.
	model_repo = "showlab/ShowUI-2B"
	destination_folder = "./showui-2b"

	# Ensure the destination folder exists
	os.makedirs(destination_folder, exist_ok=True)

	# List all files in the repository
	files = list_repo_files(repo_id=model_repo)

	# Download each file to the destination folder
	for repo_file in files:
	    file_path = hf_hub_download(repo_id=model_repo, filename=repo_file, local_dir=destination_folder)
	    print(f"Downloaded {repo_file} to {file_path}")

	# Load ShowUI from the folder populated above; reusing destination_folder keeps
	# the download target and the load path from drifting apart.
	model = Qwen2VLForConditionalGeneration.from_pretrained(
	    destination_folder,
	    torch_dtype=torch.bfloat16,
	    device_map="cuda",
	)

	# Load the processor
	processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", min_pixels=MIN_PIXELS, max_pixels=MAX_PIXELS)

	# Second model, used by infer_moon for the side-by-side comparison.
	model_moon = AutoModelForCausalLM.from_pretrained("vikhyatk/moondream2", revision="2025-01-09", trust_remote_code=True, device_map={"": "cuda"})


	# Helper functions
	def draw_point(image_input, point=None, radius=5):
	    """Return the image with an optional red dot drawn at a relative point.

	    Args:
	        image_input: Either a filesystem path (``str``), which is opened with
	            PIL, or array-like pixel data, which is converted with
	            ``Image.fromarray(np.uint8(...))``.
	        point: Optional ``[x, y]`` pair of relative coordinates; ``x`` is
	            multiplied by the image width and ``y`` by the image height to
	            obtain pixel positions. ``None`` or an empty sequence leaves the
	            image unmarked.
	        radius: Radius of the dot, in pixels.

	    Returns:
	        The PIL image, with a filled red ellipse centred on the point when
	        one was supplied.
	    """
	    if isinstance(image_input, str):
	        # Image.open is lazy and keeps the file open; copy the pixels into a
	        # detached image so the handle is released as soon as we are done.
	        with Image.open(image_input) as opened:
	            image = opened.copy()
	    else:
	        image = Image.fromarray(np.uint8(image_input))

	    # Use len() rather than truthiness so a NumPy array is accepted as well.
	    if point is not None and len(point) > 0:
	        center_x = point[0] * image.width
	        center_y = point[1] * image.height
	        bounds = (center_x - radius, center_y - radius, center_x + radius, center_y + radius)
	        ImageDraw.Draw(image).ellipse(bounds, fill="red")
	    return image


	def array_to_image_path(image_array):
	    """Save the uploaded image as a PNG in the working directory.

	    Args:
	        image_array: Array-like pixel data; converted with
	            ``Image.fromarray(np.uint8(image_array))``.

	    Returns:
	        The absolute path of the PNG file that was written.

	    Raises:
	        ValueError: If ``image_array`` is ``None``.
	    """
	    import uuid  # local import: only needed here, to make file names unique

	    if image_array is None:
	        raise ValueError("No image provided. Please upload an image before submitting.")

	    img = Image.fromarray(np.uint8(image_array))
	    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	    # The timestamp only has one-second resolution, so two saves within the
	    # same second would overwrite each other; a random suffix keeps each
	    # saved file distinct.
	    filename = f"image_{timestamp}_{uuid.uuid4().hex[:8]}.png"
	    img.save(filename)
	    return os.path.abspath(filename)


	def infer_moon(img, query):
	    """Ask the Moondream model to point at ``query`` and time the call.

	    Args:
	        img: Array-like pixel data; converted with
	            ``Image.fromarray(np.uint8(img))`` before being passed to the model.
	        query: Text description forwarded to ``model_moon.point``.

	    Returns:
	        A ``(point, elapsed)`` tuple. ``point`` is ``[x, y]`` taken from the
	        first entry of the model's ``"points"`` list, each rounded to two
	        decimals, or ``None`` when the list is empty. ``elapsed`` is a string
	        such as ``"1.23 seconds"``.
	    """
	    # perf_counter is monotonic, so the measured interval cannot be skewed by
	    # system clock adjustments.
	    started = time.perf_counter()
	    image = Image.fromarray(np.uint8(img))
	    points = model_moon.point(image, query)["points"]
	    if points:
	        first = points[0]
	        converted_data = [round(first["x"], 2), round(first["y"], 2)]
	    else:
	        # No match: report "no point" instead of failing on points[0].
	        converted_data = None
	    total_time = time.perf_counter() - started
	    return converted_data, f"{round(total_time, 2)} seconds"


	def _parse_click_xy(output_text):
	    """Parse decoded model text into a coordinate list, or None if it is not one."""
	    try:
	        parsed = ast.literal_eval(output_text.strip())
	    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
	        return None
	    if not isinstance(parsed, (list, tuple)) or len(parsed) < 2:
	        return None
	    # bool is a subclass of int, so exclude it explicitly.
	    if not all(isinstance(v, (int, float)) and not isinstance(v, bool) for v in parsed):
	        return None
	    return list(parsed)


	def infer_showui(image_path, query):
	    """Ask the ShowUI model where to click for ``query`` and time the call.

	    Args:
	        image_path: Path of the screenshot, placed in the image item of the
	            chat message.
	        query: Text description of the element to locate.

	    Returns:
	        A ``(click_xy, elapsed)`` tuple. ``click_xy`` is the list parsed from
	        the model's decoded output, or ``None`` when that output is not a
	        literal sequence of at least two numbers. ``elapsed`` is a string
	        such as ``"1.23 seconds"``.
	    """
	    started = time.perf_counter()
	    messages = [
	        {
	            "role": "user",
	            "content": [
	                {"type": "text", "text": _SYSTEM},
	                {"type": "image", "image": image_path, "min_pixels": MIN_PIXELS, "max_pixels": MAX_PIXELS},
	                {"type": "text", "text": query},
	            ],
	        }
	    ]

	    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	    image_inputs, video_inputs = process_vision_info(messages)
	    inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
	    # Follow the model's own device instead of hard-coding "cuda".
	    inputs = inputs.to(model.device)

	    # Generate output, then drop the prompt tokens from the front of each sequence.
	    generated_ids = model.generate(**inputs, max_new_tokens=128)
	    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
	    output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

	    # The text comes from a model and may not be a valid literal; parse it
	    # defensively so one bad generation does not abort the whole request.
	    click_xy = _parse_click_xy(output_text)
	    if click_xy is None:
	        print(f"ShowUI output could not be parsed as coordinates: {output_text!r}")

	    total_time = time.perf_counter() - started
	    return click_xy, f"{round(total_time, 2)} seconds"


	def run(image, query):
	    """Run both models on one screenshot and render each prediction.

	    Args:
	        image: The uploaded screenshot as array-like pixel data.
	        query: Text description of the element to locate.

	    Returns:
	        A 4-tuple ``(showui_image, showui_time, moondream_image,
	        moondream_time)``: each image is the saved screenshot with that
	        model's predicted point drawn on it, each time is the elapsed-time
	        string reported by the corresponding inference helper.
	    """
	    saved_path = array_to_image_path(image)

	    # Moondream receives the raw array, ShowUI the file that was just saved.
	    moon_point, moon_elapsed = infer_moon(image, query)
	    showui_point, showui_elapsed = infer_showui(saved_path, query)

	    # Each overlay is drawn on its own freshly opened copy of the saved file.
	    showui_overlay = draw_point(saved_path, showui_point, radius=10)
	    moon_overlay = draw_point(saved_path, moon_point, radius=10)

	    return showui_overlay, showui_elapsed, moon_overlay, moon_elapsed


	def build_demo():
	    """Build the Gradio UI that compares ShowUI and Moondream predictions.

	    The layout has an input column (screenshot, query, submit button and
	    optional examples) and an output column (one annotated image and one
	    timing field per model, plus Regenerate and Clear buttons).

	    Returns:
	        The constructed ``gr.Blocks`` instance; it is not launched here.
	    """
	    missing_image_message = "No image provided. Please upload an image before submitting."

	    example_rows = [
	        ["./examples/app_store.png", "Download Kindle."],
	        ["./examples/ios_setting.png", "Turn off Do not disturb."],
	        ["./examples/image_13.png", "Tap on vehicle search."],
	        ["./examples/map.png", "Boston."],
	        ["./examples/wallet.png", "Scan a QR code."],
	        ["./examples/word.png", "More shapes."],
	        ["./examples/web_shopping.png", "Proceed to checkout."],
	        ["./examples/web_forum.png", "Post my comment."],
	        ["./examples/safari_google.png", "Click on search bar."],
	    ]
	    # Only offer examples whose screenshot is present on disk.
	    # NOTE(review): a missing example file presumably breaks UI construction;
	    # confirm against the Gradio version in use.
	    available_examples = [row for row in example_rows if os.path.exists(row[0])]

	    def on_submit(image, query):
	        """Handle the submit button click."""
	        if image is None:
	            # gr.Error shows the message to the user in the UI.
	            raise gr.Error(missing_image_message)

	        # Save a copy of the upload and keep its path in the state; run()
	        # saves its own working copy for inference.
	        image_path = array_to_image_path(image)
	        return run(image, query) + (image_path,)

	    def on_regenerate(image, query, _saved_path):
	        """Re-run both models on the current inputs; the stored path is not used."""
	        if image is None:
	            raise gr.Error(missing_image_message)
	        return run(image, query)

	    with gr.Blocks(title="ShowUI Demo", theme=gr.themes.Default()) as demo:
	        # State holding the path saved by the last submit
	        state_image_path = gr.State(value=None)

	        with gr.Row():
	            with gr.Column(scale=3):
	                # Input components
	                imagebox = gr.Image(type="numpy", label="Input Screenshot")
	                textbox = gr.Textbox(
	                    show_label=True,
	                    placeholder="Enter a query (e.g., 'Click Nahant')",
	                    label="Query",
	                )
	                submit_btn = gr.Button(value="Submit", variant="primary")

	                if available_examples:
	                    gr.Examples(
	                        examples=available_examples,
	                        inputs=[imagebox, textbox],
	                        examples_per_page=3,
	                    )

	            with gr.Column(scale=8):
	                # Output components
	                output_img1 = gr.Image(type="pil", label="Show UI Output")
	                output_time1 = gr.Text(label="showui inference time")
	                output_img2 = gr.Image(type="pil", label="Moon dream Output")
	                output_time2 = gr.Text(label="moondream inference time")

	                # Add a note below the images to explain the red point
	                gr.HTML(
	                    """
	                    <p><strong>Note:</strong> The <span style="color: red;">red point</span> on the output images represents the predicted clickable coordinates.</p>
	                    """
	                )

	                # Buttons for regenerating and clearing
	                with gr.Row(elem_id="action-buttons", equal_height=True):
	                    regenerate_btn = gr.Button(value="🔄 Regenerate", variant="secondary")
	                    clear_btn = gr.Button(value="🗑️ Clear", interactive=True)

	        result_outputs = [output_img1, output_time1, output_img2, output_time2]

	        submit_btn.click(
	            on_submit,
	            [imagebox, textbox],
	            result_outputs + [state_image_path],
	        )

	        # Reset every input and output, including both timing fields.
	        cleared = [imagebox, textbox] + result_outputs + [state_image_path]
	        clear_btn.click(
	            lambda: (None,) * len(cleared),
	            inputs=None,
	            outputs=cleared,
	            queue=False,
	        )

	        regenerate_btn.click(
	            on_regenerate,
	            [imagebox, textbox, state_image_path],
	            result_outputs,
	        )

	    return demo


	if __name__ == "__main__":
	    # Script entry point: build the UI, enable the request queue with
	    # api_open=False, then launch with the options below.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "ssr_mode": False,
	        "debug": True,
	        "share": True,
	    }
	    demo = build_demo()
	    queued_demo = demo.queue(api_open=False)
	    queued_demo.launch(**launch_options)