From 7c18615daf33df3372342324e111f160cfbe343e Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Tue, 23 Jun 2026 20:56:26 +0800 Subject: [PATCH] Add pre-action grounding guard (bounds check + snap-to-element) --- README/WHATS_NEW_zh-CN.md | 6 ++ README/WHATS_NEW_zh-TW.md | 6 ++ WHATS_NEW.md | 6 ++ .../doc/new_features/v153_features_doc.rst | 40 ++++++++++ docs/source/Eng/eng_index.rst | 1 + .../Zh/doc/new_features/v153_features_doc.rst | 33 +++++++++ docs/source/Zh/zh_index.rst | 1 + je_auto_control/__init__.py | 7 ++ .../gui/script_builder/command_schema.py | 13 ++++ .../utils/action_grounding/__init__.py | 6 ++ .../action_grounding/action_grounding.py | 74 +++++++++++++++++++ .../utils/executor/action_executor.py | 19 +++++ .../utils/mcp_server/tools/_factories.py | 22 +++++- .../utils/mcp_server/tools/_handlers.py | 5 ++ .../headless/test_action_grounding_batch.py | 62 ++++++++++++++++ 15 files changed, 300 insertions(+), 1 deletion(-) create mode 100644 docs/source/Eng/doc/new_features/v153_features_doc.rst create mode 100644 docs/source/Zh/doc/new_features/v153_features_doc.rst create mode 100644 je_auto_control/utils/action_grounding/__init__.py create mode 100644 je_auto_control/utils/action_grounding/action_grounding.py create mode 100644 test/unit_test/headless/test_action_grounding_batch.py diff --git a/README/WHATS_NEW_zh-CN.md b/README/WHATS_NEW_zh-CN.md index 86f8c4d0..caa23b23 100644 --- a/README/WHATS_NEW_zh-CN.md +++ b/README/WHATS_NEW_zh-CN.md @@ -1,5 +1,11 @@ # 本次更新 — AutoControl +## 本次更新 (2026-06-23) — 动作前接地防护 + +拒绝越界点击;把接近偏离者吸附到真正的元素。完整参考:[`docs/source/Zh/doc/new_features/v153_features_doc.rst`](../docs/source/Zh/doc/new_features/v153_features_doc.rst)。 + +- **`validate_action` / `snap_to_element` / `in_bounds`**(`AC_validate_action`):`guardrail` 扫文字、`loop_guard` 检测循环——两者都不在派发前验证坐标动作,所以幻觉 `(9999,-5)` 点击会打到空处、偏 5px 的点击会错过。本功能拒绝屏幕外坐标,并在提供 `targets` 时把接近偏离者吸附到最近元素中心,返回 `{ok, reason, snapped}`。纯标准库几何,作用于元素字典;执行器 `screen` 默认为实际屏幕。可无头测试;接在 agent 
循环派发之前。 + ## 本次更新 (2026-06-23) — 符记预算内的无障碍文字观测 把无障碍树转成 VLM 可操作的已编号文字区块。完整参考:[`docs/source/Zh/doc/new_features/v152_features_doc.rst`](../docs/source/Zh/doc/new_features/v152_features_doc.rst)。 diff --git a/README/WHATS_NEW_zh-TW.md b/README/WHATS_NEW_zh-TW.md index c86f1155..6cd840d6 100644 --- a/README/WHATS_NEW_zh-TW.md +++ b/README/WHATS_NEW_zh-TW.md @@ -1,5 +1,11 @@ # 本次更新 — AutoControl +## 本次更新 (2026-06-23) — 動作前接地防護 + +拒絕越界點擊;把接近偏離者吸附到真正的元素。完整參考:[`docs/source/Zh/doc/new_features/v153_features_doc.rst`](../docs/source/Zh/doc/new_features/v153_features_doc.rst)。 + +- **`validate_action` / `snap_to_element` / `in_bounds`**(`AC_validate_action`):`guardrail` 掃文字、`loop_guard` 偵測迴圈——兩者都不在派發前驗證座標動作,所以幻覺 `(9999,-5)` 點擊會打到空處、偏 5px 的點擊會錯過。本功能拒絕螢幕外座標,並在提供 `targets` 時把接近偏離者吸附到最近元素中心,回傳 `{ok, reason, snapped}`。純標準函式庫幾何,作用於元素字典;執行器 `screen` 預設為實際螢幕。可無頭測試;接在 agent 迴圈派發之前。 + ## 本次更新 (2026-06-23) — 符記預算內的無障礙文字觀測 把無障礙樹轉成 VLM 可操作的已編號文字區塊。完整參考:[`docs/source/Zh/doc/new_features/v152_features_doc.rst`](../docs/source/Zh/doc/new_features/v152_features_doc.rst)。 diff --git a/WHATS_NEW.md b/WHATS_NEW.md index 5180b364..c68df285 100644 --- a/WHATS_NEW.md +++ b/WHATS_NEW.md @@ -1,5 +1,11 @@ # What's New — AutoControl +## What's new (2026-06-23) — Pre-Action Grounding Guard + +Reject out-of-bounds clicks; snap near-misses onto the real element. Full reference: [`docs/source/Eng/doc/new_features/v153_features_doc.rst`](docs/source/Eng/doc/new_features/v153_features_doc.rst). + +- **`validate_action` / `snap_to_element` / `in_bounds`** (`AC_validate_action`): `guardrail` scans text and `loop_guard` detects loops — neither validates a coordinate action before dispatch, so a hallucinated `(9999,-5)` click fires into nothing and a 5px-off click misses. This rejects off-screen coordinates and, given `targets`, snaps a near-miss onto the nearest element's centre, returning `{ok, reason, snapped}`. Pure-stdlib geometry over element dicts; the executor `screen` defaults to the live screen. 
Headless-testable; plugs in front of an agent loop's dispatch. + ## What's new (2026-06-23) — Token-Budgeted A11y Text Observation Turn the a11y tree into an indexed text block a VLM can act on. Full reference: [`docs/source/Eng/doc/new_features/v152_features_doc.rst`](docs/source/Eng/doc/new_features/v152_features_doc.rst). diff --git a/docs/source/Eng/doc/new_features/v153_features_doc.rst b/docs/source/Eng/doc/new_features/v153_features_doc.rst new file mode 100644 index 00000000..982aeae6 --- /dev/null +++ b/docs/source/Eng/doc/new_features/v153_features_doc.rst @@ -0,0 +1,40 @@ +Pre-Action Grounding Guard +========================== + +``guardrail`` scans text for prompt-injection and ``loop_guard`` detects stuck loops — +but neither validates a *coordinate action* before it is dispatched. An agent loop +executes whatever the model returns with no bounds or target check, so a hallucinated +``(9999, -5)`` click fires into nothing and a 5-pixel-off click misses the button. +``validate_action`` adds the "detect misaligned actions before execution" guard: reject +clicks outside the screen and snap a near-miss coordinate onto the nearest known +element's centre. + +Pure-stdlib geometry over plain element dicts (``x`` / ``y`` / ``width`` / ``height``), +so it is fully unit-testable. Imports no ``PySide6``. + +Headless API +------------ + +.. code-block:: python + + from je_auto_control import validate_action, snap_to_element, in_bounds + + check = validate_action(model_action, screen_size=(1920, 1080), targets=elements) + if not check["ok"]: + print("rejected:", check["reason"]) # e.g. 
"out of bounds" + else: + x, y = check["snapped"] or (model_action["x"], model_action["y"]) + click(x, y) # snapped onto the real button + +``in_bounds(x, y, screen_size)`` is the screen-bounds predicate; ``snap_to_element`` +returns the centre of the element at (or nearest within ``max_dist`` of) a point, or +``None``; ``validate_action`` combines them, returning ``{ok, reason, snapped}`` — +rejecting out-of-bounds coordinates and snapping near-misses when ``targets`` are +supplied. Actions without a coordinate always pass. + +Executor command +---------------- + +``AC_validate_action`` (``action`` / ``screen`` / ``targets`` → ``{ok, reason, +snapped}``; ``screen`` defaults to the live screen). It is exposed as the MCP tool +``ac_validate_action`` and as a Script Builder command under **Native UI**. diff --git a/docs/source/Eng/eng_index.rst b/docs/source/Eng/eng_index.rst index 65134512..958399bf 100644 --- a/docs/source/Eng/eng_index.rst +++ b/docs/source/Eng/eng_index.rst @@ -175,6 +175,7 @@ Comprehensive guides for all AutoControl features. doc/new_features/v150_features_doc doc/new_features/v151_features_doc doc/new_features/v152_features_doc + doc/new_features/v153_features_doc doc/ocr_backends/ocr_backends_doc doc/observability/observability_doc doc/operations_layer/operations_layer_doc diff --git a/docs/source/Zh/doc/new_features/v153_features_doc.rst b/docs/source/Zh/doc/new_features/v153_features_doc.rst new file mode 100644 index 00000000..2258762c --- /dev/null +++ b/docs/source/Zh/doc/new_features/v153_features_doc.rst @@ -0,0 +1,33 @@ +動作前接地防護 +============== + +``guardrail`` 掃描文字找提示注入、``loop_guard`` 偵測卡住的迴圈——但兩者都不在派發前驗證*座標動作*。agent 迴圈會 +執行模型回傳的任何東西,毫無邊界或目標檢查,因此幻覺出的 ``(9999, -5)`` 點擊會打到空處,而偏 5 像素的點擊會錯過 +按鈕。``validate_action`` 加入「執行前偵測錯位動作」防護:拒絕螢幕外點擊,並把接近但偏離的座標吸附到最近已知元素 +的中心。 + +純標準函式庫幾何,作用於純元素字典(``x`` / ``y`` / ``width`` / ``height``),因此完全可單元測試。不匯入 ``PySide6``。 + +無頭 API +-------- + +.. 
code-block:: python + + from je_auto_control import validate_action, snap_to_element, in_bounds + + check = validate_action(model_action, screen_size=(1920, 1080), targets=elements) + if not check["ok"]: + print("rejected:", check["reason"]) # 例如 "out of bounds" + else: + x, y = check["snapped"] or (model_action["x"], model_action["y"]) + click(x, y) # 已吸附到真正的按鈕 + +``in_bounds(x, y, screen_size)`` 是螢幕邊界判斷式;``snap_to_element`` 回傳某點所在(或在 ``max_dist`` 內最近) +元素的中心,否則 ``None``;``validate_action`` 結合兩者,回傳 ``{ok, reason, snapped}``——拒絕越界座標,並在提供 +``targets`` 時吸附接近偏離者。沒有座標的動作一律通過。 + +執行器命令 +---------- + +``AC_validate_action``(``action`` / ``screen`` / ``targets`` → ``{ok, reason, snapped}``;``screen`` 預設為實際 +螢幕)。它以 MCP 工具 ``ac_validate_action`` 以及 Script Builder 中 **Native UI** 分類下的命令提供。 diff --git a/docs/source/Zh/zh_index.rst b/docs/source/Zh/zh_index.rst index b574c82b..be144076 100644 --- a/docs/source/Zh/zh_index.rst +++ b/docs/source/Zh/zh_index.rst @@ -175,6 +175,7 @@ AutoControl 所有功能的完整使用指南。 doc/new_features/v150_features_doc doc/new_features/v151_features_doc doc/new_features/v152_features_doc + doc/new_features/v153_features_doc doc/ocr_backends/ocr_backends_doc doc/observability/observability_doc doc/operations_layer/operations_layer_doc diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py index 93685ca2..41c65058 100644 --- a/je_auto_control/__init__.py +++ b/je_auto_control/__init__.py @@ -369,6 +369,10 @@ from je_auto_control.utils.observation import ( flatten_tree, observation_index, serialize_observation, ) +# Pre-action grounding guard (bounds check + snap-to-element) +from je_auto_control.utils.action_grounding import ( + in_bounds, snap_to_element, validate_action, +) # CI workflow annotations (GitHub Actions) from je_auto_control.utils.ci_annotations import ( emit_annotations, format_annotation, @@ -1250,6 +1254,9 @@ def start_autocontrol_gui(*args, **kwargs): "flatten_tree", "observation_index", "serialize_observation", + "in_bounds", 
+ "snap_to_element", + "validate_action", "emit_annotations", "format_annotation", "ClipboardHistory", "default_clipboard_history", "analyze_heal_log", "heal_stats", "scan_secrets", diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py index 81d515a2..12cc341f 100644 --- a/je_auto_control/gui/script_builder/command_schema.py +++ b/je_auto_control/gui/script_builder/command_schema.py @@ -2942,6 +2942,19 @@ def _add_set_of_marks_specs(specs: List[CommandSpec]) -> None: ), description="Reading-ordered, viewport-clipped, indexed element list.", )) + specs.append(CommandSpec( + "AC_validate_action", "Native UI", "Validate / Snap Action", + fields=( + FieldSpec("action", FieldType.STRING, + placeholder='{"type":"click","x":..,"y":..}'), + FieldSpec("screen", FieldType.STRING, optional=True, + placeholder="[width, height]"), + FieldSpec("targets", FieldType.STRING, optional=True, + placeholder='[{"x":..,"y":..,"width":..,"height":..}]'), + ), + description="Reject out-of-bounds clicks; snap a near-miss to the nearest " + "element.", + )) specs.append(CommandSpec( "AC_mark_screen", "Native UI", "Set-of-Marks: Number Elements", fields=( diff --git a/je_auto_control/utils/action_grounding/__init__.py b/je_auto_control/utils/action_grounding/__init__.py new file mode 100644 index 00000000..10a45ca7 --- /dev/null +++ b/je_auto_control/utils/action_grounding/__init__.py @@ -0,0 +1,6 @@ +"""Pre-action grounding guard (bounds check + snap-to-element).""" +from je_auto_control.utils.action_grounding.action_grounding import ( + in_bounds, snap_to_element, validate_action, +) + +__all__ = ["in_bounds", "snap_to_element", "validate_action"] diff --git a/je_auto_control/utils/action_grounding/action_grounding.py b/je_auto_control/utils/action_grounding/action_grounding.py new file mode 100644 index 00000000..de13756e --- /dev/null +++ b/je_auto_control/utils/action_grounding/action_grounding.py @@ -0,0 +1,74 @@ 
+"""Pre-action grounding guard — reject out-of-bounds clicks, snap near-misses.
+
+``guardrail`` scans text for prompt-injection and ``loop_guard`` detects stuck loops —
+but neither validates a *coordinate action* before it is dispatched. An agent loop
+executes whatever the model returns with no bounds or target check, so a hallucinated
+``(9999, -5)`` click fires into nothing and a 5-pixel-off click misses the button. This
+adds the "detect misaligned actions before execution" guard: reject clicks outside the
+screen and snap a near-miss coordinate onto the nearest known element's centre.
+
+Pure-stdlib geometry over plain element dicts (``x`` / ``y`` / ``width`` / ``height``),
+so it is fully unit-testable. Imports no ``PySide6``.
+"""
+import math
+from typing import Any, Dict, List, Mapping, Optional, Sequence
+
+Element = Dict[str, Any]
+
+
+def in_bounds(x: int, y: int, screen_size: Sequence[int]) -> bool:
+    """Return whether ``(x, y)`` lies within the ``(width, height)`` screen.
+
+    Valid pixels satisfy ``0 <= x < width`` and ``0 <= y < height``. Every value is
+    coerced with ``int()`` before comparing.
+    """
+    width, height = int(screen_size[0]), int(screen_size[1])
+    return 0 <= int(x) < width and 0 <= int(y) < height
+
+
+def _center(element: Element) -> List[int]:
+    """Return the ``[x, y]`` centre of an element box (integer division)."""
+    return [int(element["x"]) + int(element["width"]) // 2,
+            int(element["y"]) + int(element["height"]) // 2]
+
+
+def _contains(element: Element, x: int, y: int) -> bool:
+    """Return whether ``(x, y)`` falls inside the element's half-open box."""
+    return (int(element["x"]) <= x < int(element["x"]) + int(element["width"])
+            and int(element["y"]) <= y < int(element["y"]) + int(element["height"]))
+
+
+def _box_distance(element: Element, x: int, y: int) -> float:
+    """Return the pixel distance from ``(x, y)`` to the element's box (0 inside)."""
+    left, top = int(element["x"]), int(element["y"])
+    # A zero- or negative-size box is treated as the single pixel at its origin.
+    right = left + max(int(element["width"]), 1) - 1
+    bottom = top + max(int(element["height"]), 1) - 1
+    dx = max(left - x, 0, x - right)
+    dy = max(top - y, 0, y - bottom)
+    return math.hypot(dx, dy)
+
+
+def snap_to_element(x: int, y: int, elements: Sequence[Element], *,
+                    max_dist: float = 8.0) -> Optional[List[int]]:
+    """Return the centre of the element at / nearest to ``(x, y)`` (or ``None``).
+
+    A point inside an element snaps to that element's centre; the first containing
+    element in ``elements`` wins. Otherwise the element whose *box* lies closest to
+    the point is chosen, provided the gap is at most ``max_dist`` pixels, else
+    ``None``.
+
+    The gap is measured to the box edge rather than to the centre. Measuring to the
+    centre would mean that a box wider and taller than ``2 * max_dist`` could never be
+    snapped to from outside, which is exactly the near-miss case this guard targets.
+    """
+    px, py = int(x), int(y)
+    for element in elements:
+        if _contains(element, px, py):
+            return _center(element)
+    best: Optional[List[int]] = None
+    best_dist = float("inf")
+    for element in elements:
+        dist = _box_distance(element, px, py)
+        if dist < best_dist:
+            best_dist, best = dist, _center(element)
+    return best if best is not None and best_dist <= float(max_dist) else None
+
+
+def validate_action(action: Mapping[str, Any], *, screen_size: Sequence[int],
+                    targets: Optional[Sequence[Element]] = None,
+                    max_dist: float = 8.0) -> Dict[str, Any]:
+    """Validate a canonical action before dispatch; optionally snap to a target.
+
+    Returns ``{ok, reason, snapped}``:
+
+    * no ``x`` / ``y`` in ``action`` -> ``ok=True``, reason ``"no coordinate"``;
+    * a coordinate that ``int()`` cannot convert -> ``ok=False``, reason
+      ``"invalid coordinate"``;
+    * a coordinate outside ``screen_size`` -> ``ok=False``, reason ``"out of bounds"``;
+    * with ``targets``, a point on or within ``max_dist`` pixels of an element ->
+      ``ok=True``, reason ``"snapped"``, ``snapped=[x, y]`` (the element's centre);
+    * anything else -> ``ok=True``, reason ``"in bounds"``.
+    """
+    x, y = action.get("x"), action.get("y")
+    if x is None or y is None:
+        return {"ok": True, "reason": "no coordinate", "snapped": None}
+    try:
+        px, py = int(x), int(y)
+    except (TypeError, ValueError, OverflowError):
+        # Model output is untrusted: reject a non-numeric, NaN or infinite
+        # coordinate instead of letting the guard itself raise.
+        return {"ok": False, "reason": "invalid coordinate", "snapped": None}
+    if not in_bounds(px, py, screen_size):
+        return {"ok": False, "reason": "out of bounds", "snapped": None}
+    if targets:
+        snapped = snap_to_element(px, py, targets, max_dist=max_dist)
+        if snapped is not None:
+            return {"ok": True, "reason": "snapped", "snapped": snapped}
+    return {"ok": True, "reason": "in bounds", "snapped": None}
diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py index b41d64de..608b9fb5 100644 --- a/je_auto_control/utils/executor/action_executor.py +++ b/je_auto_control/utils/executor/action_executor.py @@ -3860,6 +3860,24 @@ def _observation_index(elements: Any, viewport: Any = None, return {"count": len(indexed), "elements": indexed} +def _validate_action(action: Any, screen: Any = None, + targets: Any = None) -> Dict[str, Any]: + """Adapter: validate a coordinate action (bounds + optional snap-to-target).""" + import json + from je_auto_control.utils.action_grounding
import validate_action + if isinstance(action, str): + action = json.loads(action) + if isinstance(targets, str): + targets = json.loads(targets) if targets.strip() else None + if isinstance(screen, str): + screen = json.loads(screen) if screen.strip() else None + if not screen: + from je_auto_control.wrapper.auto_control_screen import screen_size + screen = list(screen_size()) + return validate_action(action, screen_size=screen, + targets=list(targets) if targets else None) + + def _with_modifiers(modifiers: Any, actions: Any) -> Dict[str, Any]: """Adapter: run nested actions while modifier keys are held down.""" import json @@ -5617,6 +5635,7 @@ def __init__(self): "AC_cua_command": _cua_command, "AC_serialize_observation": _serialize_observation, "AC_observation_index": _observation_index, + "AC_validate_action": _validate_action, "AC_tile_rect": _tile_rect, "AC_grid_rects": _grid_rects, "AC_cascade_rects": _cascade_rects, diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py index 538df121..9db89be8 100644 --- a/je_auto_control/utils/mcp_server/tools/_factories.py +++ b/je_auto_control/utils/mcp_server/tools/_factories.py @@ -3309,6 +3309,26 @@ def observation_tools() -> List[MCPTool]: ] +def action_grounding_tools() -> List[MCPTool]: + return [ + MCPTool( + name="ac_validate_action", + description=("Validate a coordinate 'action' {type,x,y,…} before " + "dispatch: reject out-of-bounds clicks and, given 'targets' " + "(element boxes), snap a near-miss onto the nearest " + "element's centre. 'screen' [w,h] defaults to the live " + "screen. 
Returns {ok, reason, snapped}."), + input_schema=schema({ + "action": {"type": "object"}, + "screen": {"type": "array", "items": {"type": "integer"}}, + "targets": {"type": "array", "items": {"type": "object"}}}, + required=["action"]), + handler=h.validate_action, + annotations=READ_ONLY, + ), + ] + + def ssim_tools() -> List[MCPTool]: return [ MCPTool( @@ -6817,7 +6837,7 @@ def media_assert_tools() -> List[MCPTool]: locator_chain_tools, rich_clipboard_tools, img_histogram_tools, motion_regions_tools, window_zorder_tools, soft_assert_tools, perceptual_diff_tools, window_geometry_tools, cua_action_tools, - observation_tools, plugin_sdk_tools, governance_tools, + observation_tools, action_grounding_tools, plugin_sdk_tools, governance_tools, credential_lease_tools, egress_tools, approval_testing_tools, trajectory_eval_tools, compliance_tools, agent_trace_tools, video_report_tools, fuzzy_tools, artifact_store_tools, image_dedup_tools, diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py index 5a0a25e0..f0b9bd47 100644 --- a/je_auto_control/utils/mcp_server/tools/_handlers.py +++ b/je_auto_control/utils/mcp_server/tools/_handlers.py @@ -2320,6 +2320,11 @@ def observation_index(elements, viewport=None, max_elements=80): return _observation_index(elements, viewport, max_elements) +def validate_action(action, screen=None, targets=None): + from je_auto_control.utils.executor.action_executor import _validate_action + return _validate_action(action, screen, targets) + + def detect_drift(reference, current, threshold=0.25, bins=10): from je_auto_control.utils.executor.action_executor import _detect_drift return _detect_drift(reference, current, threshold, bins) diff --git a/test/unit_test/headless/test_action_grounding_batch.py b/test/unit_test/headless/test_action_grounding_batch.py new file mode 100644 index 00000000..b22df60b --- /dev/null +++ b/test/unit_test/headless/test_action_grounding_batch.py @@ -0,0 
+1,62 @@ +"""Headless tests for the pre-action grounding guard. No Qt.""" +import je_auto_control as ac +from je_auto_control.utils.action_grounding import ( + in_bounds, snap_to_element, validate_action, +) + +_ELEMENTS = [{"x": 100, "y": 100, "width": 40, "height": 20}, + {"x": 300, "y": 200, "width": 60, "height": 30}] + + +def test_in_bounds(): + assert in_bounds(50, 50, (1920, 1080)) is True + assert in_bounds(9999, 5, (1920, 1080)) is False + assert in_bounds(-1, 5, (1920, 1080)) is False + + +def test_snap_inside_and_near_and_far(): + assert snap_to_element(110, 108, _ELEMENTS) == [120, 110] # inside el1 + assert snap_to_element(122, 112, _ELEMENTS, max_dist=8) == [120, 110] + assert snap_to_element(500, 500, _ELEMENTS, max_dist=8) is None + + +def test_validate_rejects_out_of_bounds(): + result = validate_action({"type": "click", "x": 9999, "y": 5}, + screen_size=(1920, 1080)) + assert result["ok"] is False and result["reason"] == "out of bounds" + + +def test_validate_snaps_near_miss(): + result = validate_action({"type": "click", "x": 118, "y": 109}, + screen_size=(1920, 1080), targets=_ELEMENTS) + assert result["ok"] is True and result["snapped"] == [120, 110] + + +def test_validate_in_bounds_no_snap(): + result = validate_action({"type": "click", "x": 500, "y": 500}, + screen_size=(1920, 1080), targets=_ELEMENTS) + assert result["ok"] is True and result["reason"] == "in bounds" + assert result["snapped"] is None + + +def test_validate_no_coordinate_passes(): + result = validate_action({"type": "type", "text": "hi"}, + screen_size=(1920, 1080)) + assert result["ok"] is True and result["reason"] == "no coordinate" + + +# --- wiring --------------------------------------------------------------- + +def test_wiring(): + assert "AC_validate_action" in set(ac.executor.known_commands()) + from je_auto_control.utils.mcp_server.tools import build_default_tool_registry + names = {t.name for t in build_default_tool_registry()} + assert "ac_validate_action" in 
names + from je_auto_control.gui.script_builder.command_schema import _build_specs + specs = {s.command for s in _build_specs()} + assert "AC_validate_action" in specs + + +def test_facade_exports(): + for attr in ("in_bounds", "snap_to_element", "validate_action"): + assert hasattr(ac, attr) and attr in ac.__all__