import base64, os
# import spaces
import json
import torch
import gradio as gr
from typing import Optional
from PIL import Image, ImageDraw
import numpy as np
import matplotlib.pyplot as plt
from qwen_vl_utils import process_vision_info
from datasets import load_dataset
from transformers import AutoProcessor
from gui_actor.constants import chat_template
from gui_actor.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer
from gui_actor.inference import inference

MAX_PIXELS = 3200 * 1800


def resize_image(image, resize_to_pixels=MAX_PIXELS):
    """Rescale the image (up or down) so its total pixel count equals
    resize_to_pixels, preserving the aspect ratio."""
    image_width, image_height = image.size
    if (resize_to_pixels is not None) and ((image_width * image_height) != resize_to_pixels):
        resize_ratio = (resize_to_pixels / (image_width * image_height)) ** 0.5
        image_width_resized = int(image_width * resize_ratio)
        image_height_resized = int(image_height * resize_ratio)
        image = image.resize((image_width_resized, image_height_resized))
    return image


# @spaces.GPU
@torch.inference_mode()
def draw_point(image: Image.Image, point: list, radius=8, color=(255, 0, 0, 128)):
    """Draw a circular marker centered at point = (x, y) on a copy of the image."""
    overlay = Image.new('RGBA', image.size, (255, 255, 255, 0))
    overlay_draw = ImageDraw.Draw(overlay)
    x, y = point
    overlay_draw.ellipse(
        [(x - radius, y - radius), (x + radius, y + radius)],
        outline=color,
        width=5,  # adjust thickness as needed
    )
    image = image.convert('RGBA')
    combined = Image.alpha_composite(image, overlay)
    combined = combined.convert('RGB')
    return combined


# @spaces.GPU
@torch.inference_mode()
def get_attn_map(image, attn_scores, n_width, n_height):
    """Overlay the model's pointer-attention scores on the image as a heatmap.

    attn_scores[0] holds one score per visual token; the tokens tile the
    image as an (n_height x n_width) grid.
    """
    w, h = image.size
    scores = np.array(attn_scores[0]).reshape(n_height, n_width)
    # Normalize scores to [0, 1]
    scores_norm = (scores - scores.min()) / (scores.max() - scores.min())
    # Resize the score map to match the image size
    score_map = Image.fromarray((scores_norm * 255).astype(np.uint8)).resize(
        (w, h), resample=Image.NEAREST  # or Image.BILINEAR for a smoother map
    )
    # Apply a colormap (returns RGBA values in [0, 1])
    colormap = plt.get_cmap('jet')
    colored_score_map = colormap(np.array(score_map) / 255.0)
    colored_score_map = (colored_score_map[:, :, :3] * 255).astype(np.uint8)
    colored_overlay = Image.fromarray(colored_score_map)
    # Blend the heatmap with the original image
    blended = Image.blend(image, colored_overlay, alpha=0.3)
    return blended


# Load the model: the 7B checkpoint with flash attention on GPU,
# falling back to the 3B checkpoint on CPU.
if torch.cuda.is_available():
    # os.system('pip install flash-attn --no-build-isolation')
    model_name_or_path = "microsoft/GUI-Actor-7B-Qwen2.5-VL"
    data_processor = AutoProcessor.from_pretrained(model_name_or_path)
    tokenizer = data_processor.tokenizer
    model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16,
        device_map="cuda:0",
        attn_implementation="flash_attention_2",
    ).eval()
else:
    model_name_or_path = "microsoft/GUI-Actor-3B-Qwen2.5-VL"
    data_processor = AutoProcessor.from_pretrained(model_name_or_path)
    tokenizer = data_processor.tokenizer
    model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16,
        device_map="cpu",
    ).eval()

title = "GUI-Actor"
header = """