有手就行，教你从0到1快速手搓搭建个GUI Agent

2026-01-20 01:18:09 网络安全文章来源：ZONE.CI 全球网 0 阅读模式

文章总结： 本文介绍利用 Gemini 3 Flash 从零构建 PC GUI Agent 的方法。通过大模型理解界面，结合 pyautogui 操作，利用 LangGraph 编排感知、决策与执行的闭环。文章详细解析了记忆机制、坐标归一化及 Prompt 设计，提供完整代码，展示了低成本实现跨平台自动任务处理的方案，具备很强的可操作性，适合开发者快速入门实践。 综合评分： 90 文章分类： 实战经验

注意看这里！模型本身是有定位能力的（也就是说可以直接输出操作对象的坐标位置），所以不需要额外的感知器来锚定具体UI元素。

（6）Agent主程序

现在把所有模块组装起来：

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
GUI Agent - 自动化GUI测试Agent
截图 -> 模型决策 -> 解析Action -> 执行 -> 循环，直到finished
"""
import json
import re
from datetime import datetime
from pathlib import Path
from typing import TypedDict

from langgraph.graph import END, StateGraph

# NOTE(review): a project package named "operator" collides with the standard
# library's "operator" module -- confirm this import resolves to the project
# package in the target environment (renaming the package would avoid it).
from operator.execute import Operation
from utils.model import LVMChat, Model
from utils.prompts import COMPUTER_USE_UITARS

# 定义State
class AgentState(TypedDict):
    """State dictionary passed between the nodes of the agent graph."""

    instruction: str  # user instruction
    screenshot_path: str  # path of the current screenshot
    step: int  # current step number
    thought: str  # the model's reasoning
    action: str  # the action string emitted by the model
    finished: bool  # whether the task is finished

class GUIAgent:
    """GUI automation agent driven by a screenshot -> decide -> execute loop.

    Each loop iteration saves a screenshot, sends it with the prompt to the
    multimodal model, then parses and executes the action string the model
    returns. The loop stops when the model emits ``finished(...)`` or returns
    no action.
    """

    # Two integers separated by whitespace and/or a comma, e.g. "500 300".
    _XY = r"(\d+)[\s,]+(\d+)"

    def __init__(self, instruction: str, model_name: str = Model.GOOGLE_GEMINI_3_FLASH_PREVIEW.value):
        """Create the agent.

        Args:
            instruction: The user task, formatted into the model prompt.
            model_name: Model identifier passed to ``LVMChat``.
        """
        self.instruction = instruction
        self.operation = Operation()
        self.lvm_chat = LVMChat(model=model_name)
        self.s_dir = Path("s")
        self.s_dir.mkdir(exist_ok=True)

        # Screen size is needed to map normalized coordinates to pixels.
        import pyautogui
        self.screen_width, self.screen_height = pyautogui.size()
        print(f"🖥️  屏幕尺寸: {self.screen_width}x{self.screen_height}")

    def normalize_coords(self, x: int, y: int) -> tuple[int, int]:
        """Convert normalized coordinates (0-1000) to actual pixel coordinates."""
        actual_x = int(x / 1000.0 * self.screen_width)
        actual_y = int(y / 1000.0 * self.screen_height)
        print(f"   归一化坐标 ({x}, {y}) -> 实际坐标 ({actual_x}, {actual_y})")
        return actual_x, actual_y

    def take_screenshot(self, state: AgentState) -> AgentState:
        """Step 1: capture a screenshot, save it, and advance the step counter."""
        step = state.get("step", 0) + 1
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        screenshot_path = str(self.s_dir / f"step_{step}_{timestamp}.png")

        self.operation.screenshot(screenshot_path)

        return {
            **state,
            "instruction": self.instruction,
            "screenshot_path": screenshot_path,
            "step": step,
            "finished": False,
        }

    @staticmethod
    def _extract_json_string(text: str, key: str) -> str:
        """Pull the string value of ``"key": "..."`` out of malformed JSON text.

        Escaped quotes inside the value are honored. Returns "" when the key
        is not found.
        """
        pattern = r'"' + re.escape(key) + r'"\s*:\s*"((?:\\.|[^"\\])*)"'
        match = re.search(pattern, text, re.DOTALL)
        if not match:
            return ""
        raw = match.group(1)
        try:
            # Decode JSON escapes (\" \\ \n ...) when the fragment is valid.
            return json.loads(f'"{raw}"')
        except json.JSONDecodeError:
            return raw

    def model_decide(self, state: AgentState) -> AgentState:
        """Step 2: ask the model for the next action (with session history).

        The reply is expected to be a JSON object with "Thought" and "Action"
        keys. A Markdown code fence around the JSON is tolerated, and a regex
        fallback is used when the reply is not a valid JSON object.
        """
        prompt = COMPUTER_USE_UITARS.format(instruction=state["instruction"])

        # use_history=True keeps earlier turns in the model context.
        response = self.lvm_chat.get_multimodal_response(
            text=prompt,
            image_paths=state["screenshot_path"],
            res_format="json",
            use_history=True,
        )

        print(f"\n📸 Step {state['step']} - 模型响应:\n{response}\n")

        # NOTE(review): assumes the response is text; anything else is
        # stringified so parsing below cannot crash -- confirm against LVMChat.
        text = response if isinstance(response, str) else str(response or "")
        payload = text.strip()
        fenced = re.match(r"^```(?:json)?\s*(.*?)\s*```$", payload, re.DOTALL | re.IGNORECASE)
        if fenced:
            payload = fenced.group(1)

        try:
            result = json.loads(payload)
        except json.JSONDecodeError:
            result = None

        if isinstance(result, dict):
            thought = result.get("Thought", "")
            action = result.get("Action", "")
        else:
            # Not a JSON object (invalid JSON, or a list/scalar): regex fallback.
            thought = self._extract_json_string(text, "Thought")
            action = self._extract_json_string(text, "Action")

        return {
            **state,
            "thought": "" if thought is None else str(thought),
            "action": "" if action is None else str(action),
        }

    def execute_action(self, state: AgentState) -> AgentState:
        """Step 3: execute the model's action and flag completion.

        Marks the state finished when there is no action or the action is
        ``finished(...)``. Execution errors are reported and the loop goes on.
        """
        action = (state.get("action") or "").strip()

        if not action:
            print("⚠️ 没有可执行的动作")
            return {**state, "finished": True}

        # Completion check.
        if action.startswith("finished("):
            content_match = re.search(r"finished\(content='([^']*)'\)", action)
            content = content_match.group(1) if content_match else "任务完成"
            print(f"✅ 任务完成: {content}")
            return {**state, "finished": True}

        # Best effort: a failed action is reported, then the next screenshot
        # lets the model react to whatever state the screen is in.
        try:
            self._parse_and_execute(action)
        except Exception as e:
            print(f"❌ 执行动作失败: {e}")
            print(f"   动作: {action}")

        return state

    @staticmethod
    def _find_point(action: str, param: str = "point"):
        """Return the ``(x, y)`` integers given for ``param``, or ``None``.

        Accepts ``param='<point>x y</point>'`` and ``param='x y'`` (either
        quote style, whitespace or comma separated). For the plain ``point``
        parameter a bare ``<point>x y</point>`` anywhere in the action is
        accepted as well.
        """
        named = (
            r"(?<![A-Za-z_])" + re.escape(param)  # do not match inside "start_point"
            + r"\s*=\s*['\"]\s*(?:<point>)?\s*" + GUIAgent._XY
            + r"\s*(?:</point>)?\s*['\"]"
        )
        match = re.search(named, action)
        if match is None and param == "point":
            match = re.search(r"<point>\s*" + GUIAgent._XY + r"\s*</point>", action)
        if match is None:
            return None
        return int(match.group(1)), int(match.group(2))

    @staticmethod
    def _find_quoted_arg(action: str, name: str):
        """Return the raw quoted value of keyword ``name``, or ``None``.

        The value ends at the quote character that opened it, so the other
        quote character and backslash-escaped quotes may appear inside it.
        """
        pattern = (
            r"(?<![A-Za-z_])" + re.escape(name)
            + r"\s*=\s*(['\"])((?:\\.|(?!\1).)*)\1"
        )
        match = re.search(pattern, action, re.DOTALL)
        return match.group(2) if match else None

    def _parse_and_execute(self, action: str):
        """Parse an action string and execute it.

        Supported actions: ``click``, ``left_double`` / ``double_click``,
        ``type``, ``hotkey``, ``scroll``, ``wait`` and ``drag``. Actions that
        cannot be parsed, or are unknown, are reported and skipped.
        """
        print(f"🔧 执行动作: {action}")

        # click(point='<point>x y</point>') or click(point='x y')
        if action.startswith("click("):
            point = self._find_point(action)
            if point:
                actual_x, actual_y = self.normalize_coords(*point)
                self.operation.click(actual_x, actual_y)
            else:
                print(f"⚠️ 无法解析点击坐标: {action}")

        # left_double(point='<point>x y</point>') or double_click(point='x y')
        elif action.startswith(("left_double(", "double_click(")):
            point = self._find_point(action)
            if point:
                actual_x, actual_y = self.normalize_coords(*point)
                self.operation.double_click(actual_x, actual_y)
            else:
                print(f"⚠️ 无法解析双击坐标: {action}")

        # type(content='xxx')
        elif action.startswith("type("):
            text = self._find_quoted_arg(action, "content")
            if text is not None:
                # Undo the escapes the model uses inside the quoted argument.
                text = text.replace(r"\'", "'").replace(r'\"', '"').replace(r"\n", "\n")
                self.operation.input(text)
            else:
                print(f"⚠️ 无法解析输入内容: {action}")

        # hotkey(key='ctrl c')
        elif action.startswith("hotkey("):
            key_spec = self._find_quoted_arg(action, "key")
            keys = key_spec.split() if key_spec else []
            if keys:
                self.operation.hotkey(*keys)
            else:
                print(f"⚠️ 无法解析快捷键: {action}")

        # scroll(point='<point>x y</point>', direction='down')
        elif action.startswith("scroll("):
            point = self._find_point(action)
            direction = self._find_quoted_arg(action, "direction")
            if point and direction is not None:
                actual_x, actual_y = self.normalize_coords(*point)
                direction = direction.strip().lower()
                # Move to the target first so the wheel event lands there.
                import pyautogui
                pyautogui.moveTo(actual_x, actual_y)
                if direction in ("left", "right"):
                    # Horizontal wheel: negative scrolls left, positive right.
                    pyautogui.hscroll(-3 if direction == "left" else 3)
                else:
                    # Anything other than "up" scrolls down.
                    pyautogui.scroll(3 if direction == "up" else -3)
            else:
                print(f"⚠️ 无法解析滚动参数: {action}")

        # wait()
        elif action.startswith("wait("):
            self.operation.wait(seconds=5)

        # drag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')
        elif action.startswith("drag("):
            start = self._find_point(action, "start_point")
            end = self._find_point(action, "end_point")
            if start and end:
                actual_x1, actual_y1 = self.normalize_coords(*start)
                actual_x2, actual_y2 = self.normalize_coords(*end)
                import pyautogui
                pyautogui.moveTo(actual_x1, actual_y1)
                # drag() takes offsets relative to the current position.
                pyautogui.drag(actual_x2 - actual_x1, actual_y2 - actual_y1, duration=0.5)
            else:
                print(f"⚠️ 无法解析拖拽坐标: {action}")

        else:
            print(f"⚠️ 未知动作: {action}")

        # Give the UI a moment to respond before the next screenshot.
        self.operation.wait(seconds=1)

    def should_continue(self, state: AgentState) -> str:
        """Return "end" when the state is marked finished, else "continue"."""
        return "end" if state.get("finished", False) else "continue"

    def run(self, recursion_limit: int = 100):
        """Build the graph, run the loop, and return the final state.

        Args:
            recursion_limit: LangGraph recursion limit for the run. One loop
                iteration passes through three nodes (screenshot, decide,
                execute), so the default of 100 allows roughly 33 iterations.
        """
        workflow = StateGraph(AgentState)

        # Nodes.
        workflow.add_node("screenshot", self.take_screenshot)
        workflow.add_node("decide", self.model_decide)
        workflow.add_node("execute", self.execute_action)

        # Edges: screenshot -> decide -> execute -> (screenshot | END).
        workflow.set_entry_point("screenshot")
        workflow.add_edge("screenshot", "decide")
        workflow.add_edge("decide", "execute")
        workflow.add_conditional_edges(
            "execute",
            self.should_continue,
            {
                "continue": "screenshot",
                "end": END,
            },
        )

        app = workflow.compile()

        print(f"🚀 开始执行任务: {self.instruction}\n")

        config = {"recursion_limit": recursion_limit}
        final_state = app.invoke(
            {"instruction": self.instruction, "step": 0},
            config=config,
        )

        print(f"\n🎉 任务完成! 共执行 {final_state['step']} 步")
        return final_state

# Script entry point: run the agent on a sample instruction.
if __name__ == "__main__":
    agent = GUIAgent(instruction="""打开浏览器查询GUI, 找到wikipedia的介绍页面进行查看""")
    agent.run()

4. 执行效果

文章前言部分的demo, 模型的部分决策内容如下:

对于中间步骤，模型会利用到上文的内容，与当前页面状态一起作为决策依据：

最后，如果模型判断任务完成，会输出finished指令，程序停止

四、总结

本文采用简洁易用的方案搭建了一个 PC 端的 GUI Agent，该 Agent 不仅能在 Windows 和 macOS 系统上直接运行，还可操作 Web 应用。得益于以 pyautogui（键鼠模拟）作为核心执行器，该 Agent 能够实现跨应用的操作能力。未来可通过补充滑动、拖拽等更多样的交互方式，并结合知识库的构建，进一步强化其针对特定业务场景的适配性，打造功能更强大的 GUI Agent。

免责声明：

本文所载程序、技术方法仅面向合法合规的安全研究与教学场景，旨在提升网络安全防护能力，具有明确的技术研究属性。

任何单位或个人未经授权，将本文内容用于攻击、破坏等非法用途的，由此引发的全部法律责任、民事赔偿及连带责任，均由行为人独立承担，本站不承担任何连带责任。

本站内容均为技术交流与知识分享目的发布，若存在版权侵权或其他异议，请通过邮件联系处理，具体联系方式可点击页面上方的联系我。

本文转载自：腾讯技术工程腾讯程序员腾讯程序员《有手就行，教你从0到1快速手搓搭建个GUI Agent》