有手就行,教你从0到1快速手搓搭建个 GUI Agent

admin 2026-01-20 01:18:09 网络安全文章 来源:ZONE.CI 全球网 0 阅读模式

文章总结: 本文介绍利用 Gemini 3 Flash 从零构建 PC GUI Agent 的方法。通过大模型理解界面,结合 pyautogui 操作,利用 LangGraph 编排感知、决策与执行的闭环。文章详细解析了记忆机制、坐标归一化及 Prompt 设计,提供完整代码,展示了低成本实现跨平台自动任务处理的方案,具备很强的可操作性,适合开发者快速入门实践。 综合评分: 90 文章分类: 实战经验


注意看这里!模型本身是有定位能力的(也就是说可以直接输出操作对象的坐标位置),所以不需要额外的感知器来锚定具体UI元素。

(6)Agent主程序

现在把所有模块组装起来:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
GUI Agent - 自动化GUI测试Agent
截图 -> 模型决策 -> 解析Action -> 执行 -> 循环,直到finished
"""
import re
import json
from datetime import datetime
from typing import TypedDict
from pathlib import Path

from langgraph.graph import StateGraph, END
from operator.execute import Operation
from utils.model import LVMChat, Model
from utils.prompts import COMPUTER_USE_UITARS

# 定义State
class AgentState(TypedDict, total=False):
    """State dictionary passed between the graph nodes.

    Declared with ``total=False`` because the graph is started with only
    ``instruction`` and ``step``; the remaining keys are filled in by the
    nodes as the loop progresses, and readers use ``.get(...)`` defaults.
    """

    instruction: str  # user instruction for the whole task
    screenshot_path: str  # path of the most recent screenshot
    step: int  # 1-based index of the current loop iteration
    thought: str  # model's reasoning for the current step
    action: str  # action string emitted by the model
    finished: bool  # True once the loop should stop

class GUIAgent:
    """GUI automation agent: screenshot -> model decision -> execute, in a loop.

    The loop is wired as a LangGraph ``StateGraph`` with three nodes
    (``screenshot``, ``decide``, ``execute``) and ends when the model emits
    ``finished(...)`` or returns no action at all.
    """

    # Number of wheel "clicks" sent for a single scroll action.
    SCROLL_CLICKS = 3

    # ``content=<quote>...<same quote>``; the body may contain the other quote
    # character and backslash-escaped characters (e.g. ``\'``).
    _CONTENT_RE = re.compile(r"content=(['\"])((?:\\.|(?!\1).)*)\1", re.DOTALL)

    def __init__(self, instruction: str, model_name: str = Model.GOOGLE_GEMINI_3_FLASH_PREVIEW.value):
        """Create the agent.

        Args:
            instruction: Natural-language task for the agent.
            model_name: Model identifier handed to ``LVMChat``.
        """
        self.instruction = instruction
        self.operation = Operation()
        self.lvm_chat = LVMChat(model=model_name)
        # Screenshots are written to ./s (relative to the working directory).
        self.s_dir = Path("s")
        self.s_dir.mkdir(exist_ok=True)

        # The screen size is needed to map model coordinates to real pixels.
        import pyautogui
        self.screen_width, self.screen_height = pyautogui.size()
        print(f"🖥️  屏幕尺寸: {self.screen_width}x{self.screen_height}")

    def normalize_coords(self, x: int, y: int) -> tuple[int, int]:
        """Convert model coordinates on a 0-1000 scale into screen pixels.

        The result is clamped to ``[0, size - 1]`` on each axis so that an
        input of 1000 (or an out-of-range value) still lands on the screen.

        Args:
            x: Horizontal coordinate on the 0-1000 scale.
            y: Vertical coordinate on the 0-1000 scale.

        Returns:
            ``(pixel_x, pixel_y)`` inside the screen bounds.
        """
        actual_x = int(x / 1000.0 * self.screen_width)
        actual_y = int(y / 1000.0 * self.screen_height)
        actual_x = min(max(actual_x, 0), self.screen_width - 1)
        actual_y = min(max(actual_y, 0), self.screen_height - 1)
        print(f"   归一化坐标 ({x}, {y}) -> 实际坐标 ({actual_x}, {actual_y})")
        return actual_x, actual_y

    def take_screenshot(self, state: AgentState) -> AgentState:
        """Graph node 1: capture the screen and advance the step counter.

        Returns:
            A copy of ``state`` with ``instruction``, ``screenshot_path`` and
            ``step`` updated and ``finished`` reset to ``False``.
        """
        step = state.get("step", 0) + 1
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        screenshot_path = str(self.s_dir / f"step_{step}_{timestamp}.png")

        self.operation.screenshot(screenshot_path)

        return {
            **state,
            "instruction": self.instruction,
            "screenshot_path": screenshot_path,
            "step": step,
            "finished": False,
        }

    @staticmethod
    def _parse_model_response(response) -> tuple[str, str]:
        """Extract ``(thought, action)`` from the raw model output.

        Accepts a JSON string (optionally wrapped in a Markdown code fence)
        or an already-decoded dict. When the text is not valid JSON, falls
        back to pulling the ``"Thought"`` / ``"Action"`` string values out
        with a regex. Missing values become empty strings.
        """
        text = response if isinstance(response, str) else ""
        # NOTE(review): whether LVMChat ever returns a dict for
        # res_format="json" is not visible here; handled defensively.
        parsed = response if isinstance(response, dict) else None

        if parsed is None:
            fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", text, re.DOTALL)
            payload = fenced.group(1) if fenced else text
            try:
                parsed = json.loads(payload)
            except json.JSONDecodeError:
                parsed = None

        if isinstance(parsed, dict):
            return str(parsed.get("Thought") or ""), str(parsed.get("Action") or "")

        def field(name: str) -> str:
            # Escape-aware match so an embedded \" does not cut the value short.
            match = re.search(rf'"{name}"\s*:\s*"((?:[^"\\]|\\.)*)"', text, re.DOTALL)
            if not match:
                return ""
            raw = match.group(1)
            try:
                # Decode JSON escapes the same way the main path would.
                return json.loads(f'"{raw}"', strict=False)
            except json.JSONDecodeError:
                return raw

        return field("Thought"), field("Action")

    def model_decide(self, state: AgentState) -> AgentState:
        """Graph node 2: ask the multimodal model for the next action.

        Returns:
            A copy of ``state`` with ``thought`` and ``action`` filled in
            (empty strings when they cannot be extracted).
        """
        prompt = COMPUTER_USE_UITARS.format(instruction=state["instruction"])

        # use_history=True asks LVMChat to keep the conversation context
        # across steps.
        response = self.lvm_chat.get_multimodal_response(
            text=prompt,
            image_paths=state["screenshot_path"],
            res_format="json",
            use_history=True,
        )

        print(f"\n📸 Step {state['step']} - 模型响应:\n{response}\n")

        thought, action = self._parse_model_response(response)

        return {
            **state,
            "thought": thought,
            "action": action,
        }

    def execute_action(self, state: AgentState) -> AgentState:
        """Graph node 3: run the model's action or mark the task finished.

        An empty action or a ``finished(...)`` action sets ``finished`` to
        ``True``. Any other action is executed on a best-effort basis:
        failures are printed and the loop continues, except for pyautogui's
        fail-safe exception, which is re-raised so the user can still abort.
        """
        action = (state.get("action") or "").strip()

        if not action:
            print("⚠️ 没有可执行的动作")
            return {**state, "finished": True}

        # The model signals completion with finished(content='...').
        if action.startswith("finished("):
            content_match = re.search(r"finished\(content=(['\"])(.*?)\1\)", action, re.DOTALL)
            content = content_match.group(2) if content_match else "任务完成"
            print(f"✅ 任务完成: {content}")
            return {**state, "finished": True}

        import pyautogui
        try:
            self._parse_and_execute(action)
        except pyautogui.FailSafeException:
            # Moving the mouse to a screen corner is the user's emergency
            # stop; swallowing it would keep the agent running.
            raise
        except Exception as e:
            print(f"❌ 执行动作失败: {e}")
            print(f"   动作: {action}")

        return state

    @staticmethod
    def _extract_point(action: str, arg=None):
        """Return the ``(x, y)`` integers of a point argument, or ``None``.

        With ``arg=None`` the first ``<point>x y</point>`` tag anywhere in the
        action is used, falling back to ``point='x y'``. With a named ``arg``
        (e.g. ``"start_point"``) only ``arg='<point>x y</point>'`` and
        ``arg='x y'`` are considered.
        """
        if arg is None:
            patterns = (
                r"<point>(\d+)\s+(\d+)</point>",
                r"point=['\"](\d+)\s+(\d+)['\"]",
            )
        else:
            prefix = re.escape(arg) + r"=['\"]"
            patterns = (
                prefix + r"<point>(\d+)\s+(\d+)</point>['\"]",
                prefix + r"(\d+)\s+(\d+)['\"]",
            )
        for pattern in patterns:
            match = re.search(pattern, action)
            if match:
                return int(match.group(1)), int(match.group(2))
        return None

    def _parse_and_execute(self, action: str):
        """Parse one action string and execute it.

        Supported forms: ``click``, ``left_double`` / ``double_click``,
        ``type``, ``hotkey``, ``scroll``, ``wait`` and ``drag``. Actions that
        cannot be parsed, or are not recognised, print a warning and are
        skipped. A one-second wait follows every call.
        """
        print(f"🔧 执行动作: {action}")

        # click(point='<point>x y</point>') or click(point='x y')
        if action.startswith("click("):
            point = self._extract_point(action)
            if point is not None:
                actual_x, actual_y = self.normalize_coords(*point)
                self.operation.click(actual_x, actual_y)
            else:
                print(f"⚠️ 无法解析点击坐标: {action}")

        # left_double(point='<point>x y</point>') or double_click(point='x y')
        elif action.startswith(("left_double(", "double_click(")):
            point = self._extract_point(action)
            if point is not None:
                actual_x, actual_y = self.normalize_coords(*point)
                self.operation.double_click(actual_x, actual_y)
            else:
                print(f"⚠️ 无法解析双击坐标: {action}")

        # type(content='xxx')
        elif action.startswith("type("):
            content_match = self._CONTENT_RE.search(action)
            if content_match:
                text = content_match.group(2)
                # Undo the escapes the model uses inside the content.
                text = text.replace(r"\'", "'").replace(r'\"', '"').replace(r"\n", "\n")
                self.operation.input(text)
            else:
                print(f"⚠️ 无法解析输入内容: {action}")

        # hotkey(key='ctrl c')
        elif action.startswith("hotkey("):
            key_match = re.search(r"key=['\"]([^'\"]*)['\"]", action)
            if key_match:
                keys = key_match.group(1).split()
                self.operation.hotkey(*keys)
            else:
                print(f"⚠️ 无法解析快捷键: {action}")

        # scroll(point='<point>x y</point>', direction='down')
        # or scroll(point='x y', direction='down')
        elif action.startswith("scroll("):
            point = self._extract_point(action)
            direction_match = re.search(r"direction=['\"]([^'\"]*)['\"]", action)
            if point is not None and direction_match:
                actual_x, actual_y = self.normalize_coords(*point)
                direction = direction_match.group(1)
                # Move the pointer first so the wheel event targets that spot.
                import pyautogui
                pyautogui.moveTo(actual_x, actual_y)
                if direction in ("left", "right"):
                    # pyautogui documents negative hscroll as "left".
                    # NOTE(review): horizontal wheel support differs between
                    # platforms — verify on the target OS.
                    clicks = -self.SCROLL_CLICKS if direction == "left" else self.SCROLL_CLICKS
                    pyautogui.hscroll(clicks)
                else:
                    # Anything other than "up" scrolls down.
                    clicks = self.SCROLL_CLICKS if direction == "up" else -self.SCROLL_CLICKS
                    pyautogui.scroll(clicks)
            else:
                print(f"⚠️ 无法解析滚动参数: {action}")

        # wait()
        elif action.startswith("wait("):
            self.operation.wait(seconds=5)

        # drag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')
        elif action.startswith("drag("):
            # Each endpoint is resolved on its own, so one may be tagged and
            # the other plain.
            start = self._extract_point(action, "start_point")
            end = self._extract_point(action, "end_point")
            if start is not None and end is not None:
                actual_x1, actual_y1 = self.normalize_coords(*start)
                actual_x2, actual_y2 = self.normalize_coords(*end)
                import pyautogui
                pyautogui.moveTo(actual_x1, actual_y1)
                # drag() takes an offset relative to the current position.
                pyautogui.drag(actual_x2 - actual_x1, actual_y2 - actual_y1, duration=0.5)
            else:
                print(f"⚠️ 无法解析拖拽坐标: {action}")

        else:
            print(f"⚠️ 未知动作,已跳过: {action}")

        # Give the UI a moment to react before the next screenshot.
        self.operation.wait(seconds=1)

    def should_continue(self, state: AgentState) -> str:
        """Return ``"end"`` when ``state["finished"]`` is truthy, else ``"continue"``."""
        return "end" if state.get("finished", False) else "continue"

    def run(self, recursion_limit: int = 100):
        """Build the graph, run it to completion and return the final state.

        Args:
            recursion_limit: Value passed to LangGraph as ``recursion_limit``.
                NOTE(review): LangGraph appears to count every node execution
                against this limit, so with three nodes per round the default
                of 100 allows roughly 33 screenshot/decide/execute rounds and
                raises once exceeded — confirm against the installed version.

        Returns:
            The final ``AgentState`` produced by the graph.
        """
        workflow = StateGraph(AgentState)

        # Nodes
        workflow.add_node("screenshot", self.take_screenshot)
        workflow.add_node("decide", self.model_decide)
        workflow.add_node("execute", self.execute_action)

        # Edges: screenshot -> decide -> execute -> (screenshot | END)
        workflow.set_entry_point("screenshot")
        workflow.add_edge("screenshot", "decide")
        workflow.add_edge("decide", "execute")
        workflow.add_conditional_edges(
            "execute",
            self.should_continue,
            {
                "continue": "screenshot",
                "end": END,
            },
        )

        app = workflow.compile()

        print(f"🚀 开始执行任务: {self.instruction}\n")

        config = {"recursion_limit": recursion_limit}
        final_state = app.invoke(
            {"instruction": self.instruction, "step": 0},
            config=config,
        )

        print(f"\n🎉 任务完成! 共执行 {final_state['step']} 步")
        return final_state

# Script entry point: run the demo task (the instruction asks the agent to
# open a browser, search for "GUI" and view the Wikipedia page).
if __name__ == "__main__":
    agent = GUIAgent(instruction="""打开浏览器查询GUI, 找到wikipedia的介绍页面进行查看""")
    agent.run()

4. 执行效果

文章前言部分的demo, 模型的部分决策内容如下:

对于中间步骤,模型会利用到上文的内容,与当前页面状态一起作为决策依据:

最后,如果模型判断任务完成,会输出finished指令,程序停止

四、总结

本文采用简洁易用的方案搭建了一个 PC 端的 GUI Agent,该 Agent 不仅能在 Windows 和 macOS 系统上直接运行,还可操作 Web 应用。得益于以 pyautogui(键鼠模拟)作为核心执行器,该 Agent 能够实现跨应用的操作能力。未来可通过补充滑动、拖拽等更多样的交互方式,并结合知识库的构建,进一步强化其针对特定业务场景的适配性,打造功能更强大的 GUI Agent。


免责声明:

本文所载程序、技术方法仅面向合法合规的安全研究与教学场景,旨在提升网络安全防护能力,具有明确的技术研究属性。

任何单位或个人未经授权,将本文内容用于攻击、破坏等非法用途的,由此引发的全部法律责任、民事赔偿及连带责任,均由行为人独立承担,本站不承担任何连带责任。

本站内容均为技术交流与知识分享目的发布,若存在版权侵权或其他异议,请通过邮件联系处理,具体联系方式可点击页面上方的联系我

本文转载自:腾讯技术工程 腾讯程序员 腾讯程序员《有手就行,教你从0到1快速手搓搭建个GUI Agent》

评论:0   参与:  0