# 🎯 GUI-Actor: Coordinate-Free Visual Grounding for GUI Agents

import base64, os
import json
import torch
import gradio as gr
import argparse  # 新增：导入argparse
from typing import Optional
from PIL import Image, ImageDraw
import numpy as np
import matplotlib.pyplot as plt
from qwen_vl_utils import process_vision_info
from datasets import load_dataset
from transformers import AutoProcessor
from gui_actor.constants import chat_template
from gui_actor.modeling_qwen25vl import Qwen2_5_VLForConditionalGenerationWithPointer
from gui_actor.inference import inference

# Pixel budget for an input screenshot; process() downsizes any image whose
# width * height exceeds this value.
MAX_PIXELS = 3200 * 1800


def resize_image(image, resize_to_pixels=MAX_PIXELS):
    """Rescale ``image`` so its area is approximately ``resize_to_pixels``.

    The aspect ratio is preserved by scaling both sides with the square root
    of the area ratio. Note that this scales in both directions: an image
    smaller than the target is enlarged, so callers that only want to shrink
    must check the size themselves.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``resize((width, height))`` method (a PIL image in this file).
        resize_to_pixels: Target pixel count, or ``None`` to disable resizing.

    Returns:
        The resized image, or ``image`` itself when no resize is needed
        (target is ``None``, the area already matches, or the area is zero).
    """
    if resize_to_pixels is None:
        return image

    width, height = image.size
    area = width * height
    # A zero-area image cannot be scaled (the ratio would divide by zero).
    if area == 0 or area == resize_to_pixels:
        return image

    scale = (resize_to_pixels / area) ** 0.5
    # int() truncates toward zero; clamp so a very thin image never ends up
    # with a 0-pixel side, which the resize call would reject.
    new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
    return image.resize(new_size)

@torch.inference_mode()
def draw_point(image: Image.Image, point: list, radius=8, color=(255, 0, 0, 128)):
    """Return an RGB copy of ``image`` with a circle outlined around ``point``.

    Args:
        image: Source image; it is not modified.
        point: ``(x, y)`` centre of the circle, in pixel coordinates of ``image``.
        radius: Circle radius in pixels.
        color: RGBA outline colour.

    Returns:
        A new RGB image with the circle composited on top.
    """
    # The marker is drawn on a fully transparent layer so the alpha channel of
    # ``color`` is honoured when the layer is composited over the image.
    marker_layer = Image.new('RGBA', image.size, (255, 255, 255, 0))
    cx, cy = point
    bounding_box = [(cx - radius, cy - radius), (cx + radius, cy + radius)]
    ImageDraw.Draw(marker_layer).ellipse(bounding_box, outline=color, width=5)

    base_rgba = image.convert('RGBA')
    return Image.alpha_composite(base_rgba, marker_layer).convert('RGB')

@torch.inference_mode()
def get_attn_map(image, attn_scores, n_width, n_height):
    """Blend a 'jet' heat map of the attention scores over ``image``.

    Args:
        image: PIL image the scores refer to.
        attn_scores: Indexable whose first element holds ``n_height * n_width``
            scores (presumably one per visual patch -- confirm against
            ``inference``).
        n_width: Number of grid columns the scores are reshaped to.
        n_height: Number of grid rows the scores are reshaped to.

    Returns:
        An RGB image: ``image`` blended with the colour-mapped scores at
        ``alpha=0.3``.
    """
    w, h = image.size
    scores = np.array(attn_scores[0]).reshape(n_height, n_width)

    # Min-max normalise to [0, 1]. A constant score map has zero range, which
    # would otherwise divide by zero and push NaNs into the uint8 cast; render
    # it as a uniform (all-zero) map instead.
    lowest = scores.min()
    span = scores.max() - lowest
    if span > 0:
        scores_norm = (scores - lowest) / span
    else:
        scores_norm = np.zeros(scores.shape, dtype=np.float64)

    # Upsample the coarse grid to the image size; NEAREST keeps cell edges crisp.
    score_map = Image.fromarray((scores_norm * 255).astype(np.uint8)).resize((w, h), resample=Image.NEAREST)
    colormap = plt.get_cmap('jet')
    colored_score_map = colormap(np.array(score_map) / 255.0)
    # The colormap yields RGBA floats in [0, 1]; keep RGB and convert to bytes.
    colored_score_map = (colored_score_map[:, :, :3] * 255).astype(np.uint8)
    colored_overlay = Image.fromarray(colored_score_map)

    # Image.blend requires both images to share a mode; the overlay is RGB.
    base = image if image.mode == 'RGB' else image.convert('RGB')
    return Image.blend(base, colored_overlay, alpha=0.3)

# Load the processor, tokenizer and model once at import time.
# With CUDA available the 7B checkpoint is loaded on cuda:0 with
# flash-attention 2; otherwise the 3B checkpoint is loaded on the CPU.
_cuda_available = torch.cuda.is_available()

model_name_or_path = (
    "microsoft/GUI-Actor-7B-Qwen2.5-VL"
    if _cuda_available
    else "microsoft/GUI-Actor-3B-Qwen2.5-VL"
)
data_processor = AutoProcessor.from_pretrained(model_name_or_path)
tokenizer = data_processor.tokenizer

if _cuda_available:
    _model_kwargs = {
        "torch_dtype": torch.bfloat16,
        "device_map": "cuda:0",
        "attn_implementation": "flash_attention_2",
    }
else:
    _model_kwargs = {
        "torch_dtype": torch.bfloat16,
        "device_map": "cpu",
    }

model = Qwen2_5_VLForConditionalGenerationWithPointer.from_pretrained(
    model_name_or_path,
    **_model_kwargs,
).eval()

# Page title passed to gr.Blocks in main().
title = "GUI-Actor"
# HTML banner rendered through gr.Markdown at the top of the page.
# NOTE(review): verify that the arXiv link below points to the GUI-Actor
# paper -- the id cannot be confirmed from this file.
header = """
<div align="center">
    <h1 style="padding-bottom: 10px; padding-top: 10px;">🎯 <strong>GUI-Actor</strong>: Coordinate-Free Visual Grounding for GUI Agents</h1>
    <div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;">
        Qianhui Wu*, Kanzhi Cheng*, Rui Yang*, Chaoyun Zhang, Jianwei Yang, Huiqiang Jiang, Jian Mu, Baolin Peng, Bo Qiao, Reuben Tan, Si Qin, Lars Liden<br>
        Qingwei Lin, Huan Zhang, Tong Zhang, Jianbing Zhang, Dongmei Zhang, Jianfeng Gao<br/>
    </div>
    <div style="padding-bottom: 10px; padding-top: 10px; font-size: 16px;">
        <a href="https://microsoft.github.io/GUI-Actor/">🌐 Project Page</a> | <a href="https://arxiv.org/abs/2403.12968">📄 arXiv Paper</a> | <a href="https://github.com/microsoft/GUI-Actor">💻 Github Repo</a><br/>
    </div>
</div>
"""

# Theme name. NOTE(review): main() does not pass this to gr.Blocks, so it
# appears to be unused in this file -- confirm whether that is intended.
theme = "soft"
# Custom CSS passed to gr.Blocks in main(). The selectors target elements under
# #anno-img; NOTE(review): no component visible in this file sets that id.
css = """#anno-img .mask {opacity: 0.5; transition: all 0.2s ease-in-out;}
            #anno-img .mask.active {opacity: 0.7}"""

@torch.inference_mode()
def process(image, instruction):
    """Ground ``instruction`` in ``image`` and build the outputs shown in the UI.

    Args:
        image: PIL image of the GUI screenshot, or ``None`` when nothing was
            uploaded.
        instruction: Text describing the element to locate.

    Returns:
        A 3-tuple ``(image_with_point, coordinates_text, attention_map)``.
        On success the first item is the (possibly downscaled) screenshot with
        the top prediction circled, the second is the predicted point formatted
        as ``"(x, y)"``, and the third is the attention heat map. When no image
        is given the result is ``(None, <error text>, None)``; when inference
        raises, it is ``(image, <error text>, None)``.
    """
    # Submitting without an upload passes None; report it instead of crashing
    # on ``image.size``.
    if image is None:
        return None, "Error: no image provided", None

    # Downscale oversized screenshots to the pixel budget.
    w, h = image.size
    if w * h > MAX_PIXELS:
        image = resize_image(image)
        # Re-read the size: the predicted point is scaled by these values and
        # drawn on the resized image, so the original size would misplace it.
        w, h = image.size

    conversation = [
        {
            "role": "system",
            "content": [
                {
                    "type": "text",
                    "text": "You are a GUI agent. Given a screenshot of the current GUI and a human instruction, your task is to locate the screen element that corresponds to the instruction. You should output a PyAutoGUI action that performs a click on the correct position. To indicate the click location, we will use some special tokens, which is used to refer to a visual patch later. For example, you can output: pyautogui.click(<your_special_token_here>).",
                }
            ]
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {
                    "type": "text",
                    "text": instruction,
                },
            ],
        },
    ]

    # Best-effort boundary for the UI: any inference failure is shown to the
    # user as text rather than propagated.
    try:
        pred = inference(conversation, model, tokenizer, data_processor, use_placeholder=True, topk=3)
    except Exception as e:
        print(e)
        return image, f"Error: {e}", None

    # The top-1 point is multiplied by the image width/height below, i.e. it is
    # treated as fractions of the image size.
    px, py = pred["topk_points"][0]
    output_coord = f"({px:.4f}, {py:.4f})"
    img_with_point = draw_point(image, (px * w, py * h))

    n_width, n_height = pred["n_width"], pred["n_height"]
    attn_scores = pred["attn_scores"]
    att_map = get_attn_map(image, attn_scores, n_width, n_height)

    return img_with_point, output_coord, att_map


def _parse_args():
    """Parse the command-line options (``--port``, ``--host``) for the server."""
    cli = argparse.ArgumentParser(description="GUI-Actor 服务")
    cli.add_argument("--port", type=int, default=9876, help="服务端口（默认：9876）")
    cli.add_argument("--host", default="localhost", help="服务主机（默认：localhost）")
    return cli.parse_args()


def _build_demo():
    """Assemble the Gradio Blocks UI and wire the submit button to ``process``."""
    with gr.Blocks(title=title, css=css) as demo:
        gr.Markdown(header)
        with gr.Row():
            # Left column: inputs.
            with gr.Column():
                screenshot_input = gr.Image(type='pil', label='Upload image')
                instruction_input = gr.Textbox(
                    label='Instruction',
                    placeholder='Text your (low-level) instruction here',
                )
                run_button = gr.Button(value='Submit', variant='primary')
            # Right column: prediction outputs.
            with gr.Column():
                annotated_output = gr.Image(type='pil', label='Image with Point (red circle)')
                with gr.Accordion('Detailed prediction'):
                    coords_output = gr.Textbox(label='Predicted Coordinates', placeholder='(x, y)')
                    attention_output = gr.Image(type='pil', label='Attention Map')

        run_button.click(
            fn=process,
            inputs=[screenshot_input, instruction_input],
            outputs=[annotated_output, coords_output, attention_output],
        )
    return demo


def main():
    """Parse the CLI arguments, build the Gradio demo and launch the server."""
    args = _parse_args()
    demo = _build_demo()

    # Announce the address, then start serving with the parsed host and port.
    print("🚀 GUI-Actor 服务启动中...")
    print(f"🌐 访问地址: http://{args.host}:{args.port}")

    demo.queue().launch(
        server_name=args.host,
        server_port=args.port,
        share=True,
    )


# Script entry point.
if __name__ == "__main__":
    main()