From 6c826a4976681e61bd4934425c190d7ce42eb242 Mon Sep 17 00:00:00 2001
From: JeffreyChen <zenxcvwait@gmail.com>
Date: Fri, 26 Jun 2026 09:49:25 +0800
Subject: [PATCH 1/3] Add change_localize: attribute a screen change to the
 element boxes that changed

Existing diffs return raw pixel regions or a11y-element diffs; the gap is
'given a frame diff and a list of element boxes, which of those changed?'.
localize_changes diffs reference vs current and scores each element box by
its mean per-pixel change; rank_changes is the pure ranker (changed when
score >= threshold, sorted most-changed first). cv2/numpy lazy.
---
 WHATS_NEW.md                                  |  6 ++
 .../doc/new_features/v218_features_doc.rst    | 50 ++++++++++++
 .../Zh/doc/new_features/v218_features_doc.rst | 44 +++++++++++
 je_auto_control/__init__.py                   |  3 +
 .../gui/script_builder/command_schema.py      | 26 ++++++
 .../utils/change_localize/__init__.py         |  6 ++
 .../utils/change_localize/change_localize.py  | 74 +++++++++++++++++
 .../utils/executor/action_executor.py         | 22 ++++++
 .../utils/mcp_server/tools/_factories.py      | 30 +++++++
 .../utils/mcp_server/tools/_handlers.py       | 13 +++
 .../headless/test_change_localize_batch.py    | 79 +++++++++++++++++++
 11 files changed, 353 insertions(+)
 create mode 100644 docs/source/Eng/doc/new_features/v218_features_doc.rst
 create mode 100644 docs/source/Zh/doc/new_features/v218_features_doc.rst
 create mode 100644 je_auto_control/utils/change_localize/__init__.py
 create mode 100644 je_auto_control/utils/change_localize/change_localize.py
 create mode 100644 test/unit_test/headless/test_change_localize_batch.py

diff --git a/WHATS_NEW.md b/WHATS_NEW.md
index 25fe4b24..3ebd77c4 100644
--- a/WHATS_NEW.md
+++ b/WHATS_NEW.md
@@ -2,6 +2,12 @@
 
 ## What's new (2026-06-26)
 
+### Localize a Change to the Elements That Changed
+
+Turn a raw screen diff into "element 3 changed" by scoring a list of element boxes. Full reference: [`docs/source/Eng/doc/new_features/v218_features_doc.rst`](docs/source/Eng/doc/new_features/v218_features_doc.rst).
+
+- **`localize_changes` / `rank_changes`** (`AC_localize_changes`, `AC_rank_changes`): existing diffs answer *where* pixels changed (`motion_regions`, `perceptual_diff`, `ssim_changed_regions` → raw pixel regions) or which *accessibility* elements differ (`element_diff`, needs metadata) — but not "given a frame diff **and a list of element boxes**, which of *those* changed?". `localize_changes` diffs a reference against the current screen and scores each supplied element box by its mean per-pixel change; `rank_changes` is the pure ranker that flags `changed` (score ≥ `threshold`) and sorts most-changed first. Pairs with `set_of_marks`/accessibility boxes to give a per-element "what changed" feedback signal after a click. cv2/numpy imported lazily; ranking is pure and fully testable. Fifth feature of the ROUND-15 perception lane. No `PySide6`.
+
 ### Theme-Invariant Matching (Light Template, Dark Mode)
 
 Find a button captured in light mode even after the app switches to dark mode. Full reference: [`docs/source/Eng/doc/new_features/v217_features_doc.rst`](docs/source/Eng/doc/new_features/v217_features_doc.rst).
diff --git a/docs/source/Eng/doc/new_features/v218_features_doc.rst b/docs/source/Eng/doc/new_features/v218_features_doc.rst
new file mode 100644
index 00000000..694ce0dd
--- /dev/null
+++ b/docs/source/Eng/doc/new_features/v218_features_doc.rst
@@ -0,0 +1,50 @@
+Localize a Change to the Elements That Changed
+==============================================
+
+The existing diffs answer "*where* did pixels change" (``motion_regions``,
+``perceptual_diff``, ``ssim_changed_regions`` return raw pixel regions) or "which
+*accessibility* elements differ" (``element_diff``, needs a11y metadata). The
+missing middle is: given a frame diff **and a list of element boxes**, which of
+*those* elements changed? ``change_localize`` scores each supplied box by how
+much it changed and ranks them.
+
+* :func:`rank_changes` — pure: take ``[{box, score}]`` and mark each box
+  ``changed`` (score at or above ``threshold``), sorted most-changed first.
+* :func:`localize_changes` — diff a reference against the current screen, score
+  each element box by its mean pixel change, and rank them.
+
+``cv2`` / ``numpy`` are imported lazily (the module stays importable without
+them) and the loaders reuse :mod:`visual_match`. The ranking is pure and fully
+testable. Imports no ``PySide6``.
+
+Headless API
+------------
+
+.. code-block:: python
+
+    from je_auto_control import localize_changes, rank_changes, mark_elements
+
+    boxes = [mark["bbox"] for mark in mark_elements(elements)]
+
+    # After an action, which of those elements actually changed?
+    changed = localize_changes("before.png", boxes, current="after.png")
+    for entry in changed:
+        if entry["changed"]:
+            print("element changed:", entry["box"], entry["score"])
+
+    # Or rank pre-computed scores yourself:
+    rank_changes([{"box": [0, 0, 40, 20], "score": 0.6}], threshold=0.1)
+
+``localize_changes`` returns ``[{box, score, changed}]`` sorted most-changed
+first, where ``score`` is the box's mean per-pixel change (0..1). It pairs with
+``set_of_marks`` / accessibility element boxes to turn a raw screen diff into a
+per-element "what changed" signal — an agent feedback channel after a click.
+
+Executor commands
+-----------------
+
+``AC_localize_changes`` (``reference`` + ``boxes`` JSON list + ``current`` /
+``threshold`` / ``region`` → ``{changes}``) and ``AC_rank_changes``
+(``scored_boxes`` JSON list + ``threshold`` → ``{changes}``, pure). Both are
+also available as read-only ``ac_*`` MCP tools and as Script Builder commands
+under **Image**.
diff --git a/docs/source/Zh/doc/new_features/v218_features_doc.rst b/docs/source/Zh/doc/new_features/v218_features_doc.rst
new file mode 100644
index 00000000..bf69978d
--- /dev/null
+++ b/docs/source/Zh/doc/new_features/v218_features_doc.rst
@@ -0,0 +1,44 @@
+把變化歸因到實際改變的元素
+==========================
+
+既有的 diff 回答「像素在*哪裡*改變」(``motion_regions``、``perceptual_diff``、
+``ssim_changed_regions`` 回傳原始像素區域),或「哪些*無障礙*元素不同」(``element_diff``,需 a11y 中介資料)。
+缺少的中段是:給定一個畫面 diff **與一份元素方框清單**,*那些*元素中哪些改變了?``change_localize`` 依
+每個提供的方框改變多少評分並排序。
+
+* :func:`rank_changes` ——純函式:接受 ``[{box, score}]`` 並把每個方框標記為 ``changed``
+  (分數達到或超過 ``threshold``),依改變最多排在最前。
+* :func:`localize_changes` ——把參考影像對目前螢幕做 diff,依每個元素方框的平均像素改變評分,再排序。
+
+``cv2`` / ``numpy`` 採延遲匯入(模組無需它們即可匯入),載入器重用 :mod:`visual_match`。
+排序為純函式且可完整測試。不匯入 ``PySide6``。
+
+無頭 API
+--------
+
+.. code-block:: python
+
+    from je_auto_control import localize_changes, rank_changes, mark_elements
+
+    boxes = [mark["bbox"] for mark in mark_elements(elements)]
+
+    # 某動作後,那些元素中哪些真的改變了?
+    changed = localize_changes("before.png", boxes, current="after.png")
+    for entry in changed:
+        if entry["changed"]:
+            print("元素改變:", entry["box"], entry["score"])
+
+    # 或自行排序預先算好的分數:
+    rank_changes([{"box": [0, 0, 40, 20], "score": 0.6}], threshold=0.1)
+
+``localize_changes`` 回傳 ``[{box, score, changed}]`` 依改變最多排序,``score`` 是方框的平均
+逐像素改變(0..1)。它與 ``set_of_marks`` / 無障礙元素方框搭配,把原始螢幕 diff 轉成逐元素的
+「什麼改變了」訊號——點擊後的 agent 回饋通道。
+
+執行器指令
+----------
+
+``AC_localize_changes``(``reference`` 加上 ``boxes`` JSON 清單加上 ``current`` /
+``threshold`` / ``region`` → ``{changes}``)與 ``AC_rank_changes``(``scored_boxes`` JSON 清單加上
+``threshold`` → ``{changes}``,純函式)。皆以對應的唯讀 ``ac_*`` MCP 工具及 Script Builder 指令
+(位於 **Image** 分類下)形式提供。
diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py
index bfb1a716..8686170d 100644
--- a/je_auto_control/__init__.py
+++ b/je_auto_control/__init__.py
@@ -143,6 +143,8 @@
 )
 # Theme-invariant matching so a light template matches dark mode
 from je_auto_control.utils.theme_normalize import match_theme, normalize_theme
+# Attribute a screen change to the specific element boxes that changed
+from je_auto_control.utils.change_localize import localize_changes, rank_changes
 # Rich clipboard formats — RTF + CSV/TSV codecs and Windows get / set
 from je_auto_control.utils.clipboard_rich_formats import (
     build_rtf, csv_to_rows, get_clipboard_csv, get_clipboard_rtf, rows_to_csv,
@@ -1771,6 +1773,7 @@ def start_autocontrol_gui(*args, **kwargs):
     "place_labels", "label_color",
     "grade_contrast", "dominant_pair", "region_contrast",
     "normalize_theme", "match_theme",
+    "localize_changes", "rank_changes",
     "build_rtf", "rtf_to_text", "rows_to_csv", "csv_to_rows",
     "set_clipboard_rtf", "get_clipboard_rtf",
     "set_clipboard_csv", "get_clipboard_csv",
diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py
index 1d1f6f0f..ac0cf971 100644
--- a/je_auto_control/gui/script_builder/command_schema.py
+++ b/je_auto_control/gui/script_builder/command_schema.py
@@ -4606,6 +4606,32 @@ def _add_work_queue_specs(specs: List[CommandSpec]) -> None:
         ),
         description="Locate a template across a light/dark theme flip.",
     ))
+    specs.append(CommandSpec(
+        "AC_rank_changes", "Image", "Rank Changed Boxes",
+        fields=(
+            FieldSpec("scored_boxes", FieldType.STRING,
+                      placeholder="JSON list of {box, score}"),
+            FieldSpec("threshold", FieldType.FLOAT, optional=True,
+                      default=0.1),
+        ),
+        description="Rank scored element boxes by how much they changed.",
+    ))
+    specs.append(CommandSpec(
+        "AC_localize_changes", "Image", "Localize Changed Elements",
+        fields=(
+            FieldSpec("reference", FieldType.STRING,
+                      placeholder="reference image path"),
+            FieldSpec("boxes", FieldType.STRING,
+                      placeholder="JSON list of [x, y, w, h]"),
+            FieldSpec("current", FieldType.STRING, optional=True,
+                      placeholder="current image path (else screen)"),
+            FieldSpec("threshold", FieldType.FLOAT, optional=True,
+                      default=0.1),
+            FieldSpec("region", FieldType.STRING, optional=True,
+                      placeholder="[x, y, w, h]"),
+        ),
+        description="Rank which element boxes changed between two frames.",
+    ))
     specs.append(CommandSpec(
         "AC_normalize_ext", "Shell", "Normalize Extension",
         fields=(
diff --git a/je_auto_control/utils/change_localize/__init__.py b/je_auto_control/utils/change_localize/__init__.py
new file mode 100644
index 00000000..c11b398b
--- /dev/null
+++ b/je_auto_control/utils/change_localize/__init__.py
@@ -0,0 +1,6 @@
+"""Attribute a screen change to the specific element boxes that changed."""
+from je_auto_control.utils.change_localize.change_localize import (
+    localize_changes, rank_changes,
+)
+
+__all__ = ["localize_changes", "rank_changes"]
diff --git a/je_auto_control/utils/change_localize/change_localize.py b/je_auto_control/utils/change_localize/change_localize.py
new file mode 100644
index 00000000..efb38cc6
--- /dev/null
+++ b/je_auto_control/utils/change_localize/change_localize.py
@@ -0,0 +1,74 @@
+"""Attribute a screen change to the specific elements that changed.
+
+The existing diffs answer "*where* did pixels change" (``motion_regions``,
+``perceptual_diff``, ``ssim_changed_regions`` return raw pixel regions) or "which
+*accessibility* elements differ" (``element_diff``, needs a11y metadata). The
+missing middle is: given a frame diff **and a list of element boxes**, which of
+*those* elements changed? ``change_localize`` scores each supplied box by how
+much it changed and ranks them.
+
+* :func:`rank_changes` — pure: take ``[{box, score}]`` and mark each box
+  ``changed`` (score at or above ``threshold``), sorted most-changed first.
+* :func:`localize_changes` — diff a reference against the current screen, score
+  each element box by its mean pixel change, and rank them.
+
+cv2 / numpy are imported lazily (the module stays importable without them) and
+the loaders reuse :mod:`visual_match`. The ranking is pure and fully testable.
+Imports no ``PySide6``.
+"""
+from typing import Any, Dict, List, Optional, Sequence
+
+
+def _unpack(item: Any) -> tuple:
+    """Split a ``{box, score}`` dict or ``(box, score)`` pair into a tuple."""
+    if not isinstance(item, dict):
+        return item[0], item[1]
+    return item["box"], item["score"]
+
+
+def rank_changes(scored_boxes: Sequence[Any], *,
+                 threshold: float = 0.1) -> List[Dict[str, Any]]:
+    """Flag and order scored element boxes, most-changed first (pure).
+
+    ``scored_boxes`` holds ``{box, score}`` dicts or ``(box, score)`` pairs.
+    Each result entry is ``{box, score, changed}``: ``box`` as ints, ``score``
+    rounded to 4 places, ``changed`` set when the raw score >= ``threshold``.
+    """
+    cutoff = float(threshold)
+    ranked: List[Dict[str, Any]] = []
+    for entry in scored_boxes:
+        box, score = _unpack(entry)
+        ranked.append({"box": [int(coord) for coord in box],
+                       "score": round(float(score), 4),
+                       "changed": float(score) >= cutoff})
+    # sorted() is stable, so equal scores keep their input order.
+    return sorted(ranked, key=lambda row: row["score"], reverse=True)
+
+
+def _box_mean(diff: Any, box: Sequence[int]) -> float:
+    """Mean change (0..1) of the diff map inside ``box``; 0.0 if off-frame."""
+    x, y, w, h = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
+    patch = diff[max(0, y):max(0, y + h), max(0, x):max(0, x + w)]
+    return float(patch.mean()) if patch.size else 0.0
+
+
+def localize_changes(reference: Any, boxes: Sequence[Sequence[int]], *,
+                     current: Optional[Any] = None, threshold: float = 0.1,
+                     region: Optional[Sequence[int]] = None
+                     ) -> List[Dict[str, Any]]:
+    """Score and rank which of ``boxes`` changed between two frames.
+
+    Diffs ``reference`` against ``current`` (a fresh screen grab of ``region``
+    by default), takes each ``[x, y, w, h]`` box's mean per-pixel change, and
+    ranks via :func:`rank_changes`. Returns ``[{box, score, changed}]``.
+    """
+    import numpy as np
+    from je_auto_control.utils.visual_match.visual_match import (
+        _grab_gray, _to_gray)
+    ref = _to_gray(reference).astype("float64")
+    other = current if current is not None else _grab_gray(region)
+    cur = _to_gray(other).astype("float64")
+    diff = np.abs(ref - cur) / 255.0  # 0..1 assuming 8-bit, same-shape frames
+    scored = [{"box": list(box), "score": _box_mean(diff, box)}
+              for box in boxes]
+    return rank_changes(scored, threshold=threshold)
diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py
index 6ba236f4..fc271047 100644
--- a/je_auto_control/utils/executor/action_executor.py
+++ b/je_auto_control/utils/executor/action_executor.py
@@ -2912,6 +2912,26 @@ def _match_theme(template: Any, region: Any = None, method: Any = "sobel",
     return {"found": True, **match}
 
 
+def _rank_changes(scored_boxes: Any, threshold: Any = 0.1) -> Dict[str, Any]:
+    """Adapter: flag and order ``{box, score}`` entries by change (pure)."""
+    from je_auto_control.utils.change_localize import rank_changes
+    entries = [] if not scored_boxes else _coerce_list(scored_boxes)
+    return {"changes": rank_changes(entries, threshold=float(threshold))}
+
+
+def _localize_changes(reference: Any, boxes: Any, current: Any = None,
+                      threshold: Any = 0.1, region: Any = None
+                      ) -> Dict[str, Any]:
+    """Adapter: score which boxes changed between two frames (device)."""
+    from je_auto_control.utils.change_localize import localize_changes
+    targets = [] if not boxes else _coerce_list(boxes)
+    ranked = localize_changes(
+        str(reference), targets,
+        current=None if not current else str(current),
+        threshold=float(threshold), region=_coerce_region(region))
+    return {"changes": ranked}
+
+
 def _normalize_ext(target: str) -> Dict[str, Any]:
     """Adapter: the lowercased extension of a path / bare ext (pure)."""
     from je_auto_control.utils.file_assoc import normalize_ext
@@ -6951,6 +6971,8 @@ def __init__(self):
             "AC_dominant_pair": _dominant_pair,
             "AC_region_contrast": _region_contrast,
             "AC_match_theme": _match_theme,
+            "AC_rank_changes": _rank_changes,
+            "AC_localize_changes": _localize_changes,
             "AC_normalize_ext": _normalize_ext,
             "AC_file_association": _file_association,
             "AC_get_control_text": _get_control_text,
diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py
index 4369522b..41e8400d 100644
--- a/je_auto_control/utils/mcp_server/tools/_factories.py
+++ b/je_auto_control/utils/mcp_server/tools/_factories.py
@@ -4107,6 +4107,36 @@ def img_histogram_tools() -> List[MCPTool]:
             handler=h.match_theme,
             annotations=READ_ONLY,
         ),
+        MCPTool(
+            name="ac_rank_changes",
+            description=("Rank scored element boxes by how much they changed. "
+                         "'scored_boxes' is a list of {box:[x,y,w,h], score}. "
+                         "Pure. Returns {changes:[{box, score, changed}]} "
+                         "sorted most-changed first."),
+            input_schema=schema({"scored_boxes": {"type": "array",
+                                                 "items": {"type": "object"}},
+                                 "threshold": {"type": "number"}},
+                                required=["scored_boxes"]),
+            handler=h.rank_changes,
+            annotations=READ_ONLY,
+        ),
+        MCPTool(
+            name="ac_localize_changes",
+            description=("Which of the supplied element 'boxes' changed between "
+                         "a 'reference' image and the current screen (or "
+                         "'current' image). Returns {changes:[{box, score, "
+                         "changed}]}."),
+            input_schema=schema({"reference": {"type": "string"},
+                                 "boxes": {"type": "array",
+                                          "items": {"type": "array"}},
+                                 "current": {"type": "string"},
+                                 "threshold": {"type": "number"},
+                                 "region": {"type": "array",
+                                           "items": {"type": "integer"}}},
+                                required=["reference", "boxes"]),
+            handler=h.localize_changes,
+            annotations=READ_ONLY,
+        ),
     ]
 
 
diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py
index 5006b182..a91d0644 100644
--- a/je_auto_control/utils/mcp_server/tools/_handlers.py
+++ b/je_auto_control/utils/mcp_server/tools/_handlers.py
@@ -774,6 +774,19 @@ def match_theme(template, region=None, method="sobel", min_score=0.5):
     return _match_theme(template, region, method, min_score)
 
 
+def rank_changes(scored_boxes, threshold=0.1):
+    from je_auto_control.utils.executor import action_executor
+    return action_executor._rank_changes(scored_boxes, threshold)
+
+
+def localize_changes(reference, boxes, current=None, threshold=0.1,
+                     region=None):
+    """Forward the MCP call to the executor's ``_localize_changes`` adapter."""
+    from je_auto_control.utils.executor.action_executor import (
+        _localize_changes as adapter)
+    return adapter(reference, boxes, current, threshold, region)
+
+
 def normalize_ext(target):
     from je_auto_control.utils.executor.action_executor import _normalize_ext
     return _normalize_ext(target)
diff --git a/test/unit_test/headless/test_change_localize_batch.py b/test/unit_test/headless/test_change_localize_batch.py
new file mode 100644
index 00000000..84110e53
--- /dev/null
+++ b/test/unit_test/headless/test_change_localize_batch.py
@@ -0,0 +1,79 @@
+"""Headless tests for change_localize (pure ranking + cv2 localization)."""
+import pytest
+
+import je_auto_control as ac
+from je_auto_control.utils.change_localize import localize_changes, rank_changes
+
+
+# --- pure rank_changes ----------------------------------------------------
+
+def test_rank_changes_marks_and_sorts():
+    """Entries come back highest score first, flagged against the threshold."""
+    ranked = rank_changes([{"box": [0, 0, 10, 10], "score": 0.02},
+                           {"box": [20, 20, 10, 10], "score": 0.5},
+                           {"box": [40, 40, 10, 10], "score": 0.2}],
+                          threshold=0.1)
+    scores = [row["score"] for row in ranked]
+    assert scores == pytest.approx([0.5, 0.2, 0.02])
+    assert [row["changed"] for row in ranked] == [True, True, False]
+
+
+def test_rank_changes_accepts_tuples():
+    """``(box, score)`` pairs are accepted in place of dicts."""
+    pairs = [([0, 0, 5, 5], 0.3), ([1, 1, 5, 5], 0.05)]
+    first, second = rank_changes(pairs, threshold=0.1)
+    assert first["changed"] is True and second["changed"] is False
+
+
+def test_rank_changes_empty():
+    assert rank_changes([], threshold=0.1) == []
+
+
+def test_rank_changes_threshold_boundary():
+    """The comparison is inclusive: score == threshold counts as changed."""
+    [row] = rank_changes([{"box": [0, 0, 1, 1], "score": 0.1}], threshold=0.1)
+    assert row["changed"] is True
+
+
+# --- cv2 localize_changes (per-function importorskip) ---------------------
+
+def test_localize_changes_attributes_to_the_right_box():
+    """Only the box covering the repainted square is reported as changed."""
+    np = pytest.importorskip("numpy")
+    pytest.importorskip("cv2")
+    touched, untouched = [40, 40, 20, 20], [0, 0, 20, 20]
+    before = np.zeros((100, 100), dtype="uint8")
+    after = before.copy()
+    after[40:60, 40:60] = 255              # repaint exactly the touched box
+    ranked = localize_changes(before, [touched, untouched], current=after,
+                              threshold=0.05)
+    by_box = {tuple(row["box"]): row for row in ranked}
+    assert ranked[0]["box"] == touched
+    assert by_box[tuple(touched)]["changed"] is True
+    assert by_box[tuple(untouched)]["changed"] is False
+
+
+# --- wiring (cv2-free) ----------------------------------------------------
+
+def test_executor_pure_rank_path():
+    from je_auto_control.utils.executor.action_executor import _rank_changes
+    payload = '[{"box": [0,0,4,4], "score": 0.4}]'
+    assert _rank_changes(payload, 0.1)["changes"][0]["changed"] is True
+
+
+def test_wiring():
+    """Both commands are registered in the executor, MCP tools and builder."""
+    commands = {"AC_rank_changes", "AC_localize_changes"}
+    assert commands.issubset(ac.executor.known_commands())
+    from je_auto_control.utils.mcp_server.tools import (
+        build_default_tool_registry)
+    tool_names = {tool.name for tool in build_default_tool_registry()}
+    assert {"ac_rank_changes", "ac_localize_changes"}.issubset(tool_names)
+    from je_auto_control.gui.script_builder.command_schema import _build_specs
+    spec_names = {spec.command for spec in _build_specs()}
+    assert commands.issubset(spec_names)
+
+
+def test_facade_exports():
+    exported = ("localize_changes", "rank_changes")
+    assert all(hasattr(ac, name) and name in ac.__all__ for name in exported)

From a802ab6f80de7cad71b35c02d9911366b5f38c11 Mon Sep 17 00:00:00 2001
From: JeffreyChen <zenxcvwait@gmail.com>
Date: Fri, 26 Jun 2026 10:06:43 +0800
Subject: [PATCH 2/3] Add icon_classify: classify a widget from its pixel shape

Set-of-Marks/element proposers return boxes but not what each box is;
form_fields.checkbox_state reads a box already known to be a checkbox.
box_features extracts {aspect, fill, edge_density, circularity};
classify_widget is the pure heuristic classifier (round->radio,
wide-rounded->toggle, square-sparse->checkbox, wide-hollow->text_field,
wide-filled->button, else icon); classify_icon composes them. cv2 lazy.
---
 WHATS_NEW.md                                  |   6 +
 .../doc/new_features/v219_features_doc.rst    |  46 ++++++++
 .../Zh/doc/new_features/v219_features_doc.rst |  38 +++++++
 je_auto_control/__init__.py                   |   5 +
 .../gui/script_builder/command_schema.py      |  16 +++
 .../utils/executor/action_executor.py         |  16 +++
 .../utils/icon_classify/__init__.py           |   6 +
 .../utils/icon_classify/icon_classify.py      | 107 ++++++++++++++++++
 .../utils/mcp_server/tools/_factories.py      |  22 ++++
 .../utils/mcp_server/tools/_handlers.py       |  10 ++
 .../headless/test_icon_classify_batch.py      | 107 ++++++++++++++++++
 11 files changed, 379 insertions(+)
 create mode 100644 docs/source/Eng/doc/new_features/v219_features_doc.rst
 create mode 100644 docs/source/Zh/doc/new_features/v219_features_doc.rst
 create mode 100644 je_auto_control/utils/icon_classify/__init__.py
 create mode 100644 je_auto_control/utils/icon_classify/icon_classify.py
 create mode 100644 test/unit_test/headless/test_icon_classify_batch.py

diff --git a/WHATS_NEW.md b/WHATS_NEW.md
index 3ebd77c4..83c0dbe3 100644
--- a/WHATS_NEW.md
+++ b/WHATS_NEW.md
@@ -2,6 +2,12 @@
 
 ## What's new (2026-06-26)
 
+### Classify a Widget from Its Pixel Shape
+
+Tell a checkbox from a radio button from a text field — from pixels, no model. Full reference: [`docs/source/Eng/doc/new_features/v219_features_doc.rst`](docs/source/Eng/doc/new_features/v219_features_doc.rst).
+
+- **`classify_widget` / `box_features` / `classify_icon`** (`AC_classify_widget`, `AC_classify_icon`): Set-of-Marks and element proposers return *boxes* but not *what each box is*; `form_fields.checkbox_state` reads a box already known to be a checkbox — the gap is the typing step before it. `box_features` extracts `{aspect, fill, edge_density, circularity}` for a box; `classify_widget` is the pure heuristic classifier (round→radio, wide-rounded→toggle, square-sparse→checkbox, wide-hollow→text_field, wide-filled→button, else icon); `classify_icon` composes them. The classifier is pure and fully testable; cv2/numpy imported lazily so the module stays importable. Sixth feature of the ROUND-15 perception lane. No `PySide6`.
+
 ### Localize a Change to the Elements That Changed
 
 Turn a raw screen diff into "element 3 changed" by scoring a list of element boxes. Full reference: [`docs/source/Eng/doc/new_features/v218_features_doc.rst`](docs/source/Eng/doc/new_features/v218_features_doc.rst).
diff --git a/docs/source/Eng/doc/new_features/v219_features_doc.rst b/docs/source/Eng/doc/new_features/v219_features_doc.rst
new file mode 100644
index 00000000..8e595dde
--- /dev/null
+++ b/docs/source/Eng/doc/new_features/v219_features_doc.rst
@@ -0,0 +1,46 @@
+Classify a Widget from Its Pixel Shape
+======================================
+
+Set-of-Marks and element proposers hand back *boxes*, but not *what each box is*.
+``form_fields.checkbox_state`` already reads a box known to be a checkbox; the
+gap is the typing step before it — is this box a checkbox, a radio button, a push
+button, a text field or a toggle? ``icon_classify`` answers that from cheap
+geometric features (no model).
+
+* :func:`box_features` — extract ``{aspect, fill, edge_density, circularity}``
+  for a box region (the objective measurements).
+* :func:`classify_widget` — pure: map a feature dict to a widget type by
+  documented heuristics.
+* :func:`classify_icon` — compose the two: a box to ``{type, features}``.
+
+``classify_widget`` is pure and fully testable; ``box_features`` imports cv2 /
+numpy lazily (the module stays importable without them) and reuses
+:func:`visual_match._to_gray`. Imports no ``PySide6``.
+
+Headless API
+------------
+
+.. code-block:: python
+
+    from je_auto_control import classify_icon, classify_widget
+
+    # From a screenshot + a box:
+    classify_icon("dialog.png", [120, 80, 16, 16])
+    # {'type': 'checkbox', 'features': {'aspect': 1.0, 'fill': 0.12, ...}}
+
+    # From features you already have:
+    classify_widget({"aspect": 1.0, "circularity": 0.9, "fill": 0.4})  # 'radio'
+
+The heuristics: a round box (aspect ≈ 1, high circularity) is a ``radio``; a wide
+rounded box is a ``toggle``; a near-square sparse box is a ``checkbox``; a wide
+hollow box is a ``text_field``; a wide filled box is a ``button``; anything else
+is an ``icon``. Tune by reading ``features`` and applying your own rules where
+the defaults misfire — the measurements are the durable part.
+
+Executor commands
+-----------------
+
+``AC_classify_widget`` (``features`` JSON object → ``{type}``, pure) and
+``AC_classify_icon`` (``source`` image + ``box`` ``[x, y, w, h]`` →
+``{type, features}``). Both are also available as read-only ``ac_*`` MCP tools
+and as Script Builder commands under **Image**.
diff --git a/docs/source/Zh/doc/new_features/v219_features_doc.rst b/docs/source/Zh/doc/new_features/v219_features_doc.rst
new file mode 100644
index 00000000..00cfaf05
--- /dev/null
+++ b/docs/source/Zh/doc/new_features/v219_features_doc.rst
@@ -0,0 +1,38 @@
+從像素形狀分類控制項
+====================
+
+Set-of-Marks 與元素提案器回傳*方框*,卻不告訴你*每個方框是什麼*。``form_fields.checkbox_state``
+已能讀取一個已知是核取方塊的方框;缺少的是它之前的分類步驟——這個方框是核取方塊、單選鈕、按鈕、
+文字欄位還是切換開關?``icon_classify`` 從低成本的幾何特徵(無需模型)回答此問題。
+
+* :func:`box_features` ——擷取方框區域的 ``{aspect, fill, edge_density, circularity}``(客觀量測)。
+* :func:`classify_widget` ——純函式:以記載的啟發式規則把特徵字典映射為控制項型別。
+* :func:`classify_icon` ——組合兩者:把一個方框轉為 ``{type, features}``。
+
+``classify_widget`` 為純函式且可完整測試;``box_features`` 延遲匯入 cv2 / numpy(模組無需它們即可匯入),
+並重用 :func:`visual_match._to_gray`。不匯入 ``PySide6``。
+
+無頭 API
+--------
+
+.. code-block:: python
+
+    from je_auto_control import classify_icon, classify_widget
+
+    # 從截圖 + 方框:
+    classify_icon("dialog.png", [120, 80, 16, 16])
+    # {'type': 'checkbox', 'features': {'aspect': 1.0, 'fill': 0.12, ...}}
+
+    # 從你已有的特徵:
+    classify_widget({"aspect": 1.0, "circularity": 0.9, "fill": 0.4})  # 'radio'
+
+啟發式規則:圓形方框(aspect ≈ 1、高 circularity)為 ``radio``;寬且圓潤為 ``toggle``;
+近正方且稀疏為 ``checkbox``;寬且空心為 ``text_field``;寬且填滿為 ``button``;其餘為 ``icon``。
+在預設誤判處,可讀取 ``features`` 套用你自己的規則微調——量測值才是耐用的部分。
+
+執行器指令
+----------
+
+``AC_classify_widget``(``features`` JSON 物件 → ``{type}``,純函式)與
+``AC_classify_icon``(``source`` 影像 + ``box`` ``[x, y, w, h]`` → ``{type, features}``)。
+皆以對應的唯讀 ``ac_*`` MCP 工具及 Script Builder 指令(位於 **Image** 分類下)形式提供。
diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py
index 8686170d..54b3f178 100644
--- a/je_auto_control/__init__.py
+++ b/je_auto_control/__init__.py
@@ -145,6 +145,10 @@
 from je_auto_control.utils.theme_normalize import match_theme, normalize_theme
 # Attribute a screen change to the specific element boxes that changed
 from je_auto_control.utils.change_localize import localize_changes, rank_changes
+# Classify what kind of widget a box is from its pixel shape
+from je_auto_control.utils.icon_classify import (
+    box_features, classify_icon, classify_widget,
+)
 # Rich clipboard formats — RTF + CSV/TSV codecs and Windows get / set
 from je_auto_control.utils.clipboard_rich_formats import (
     build_rtf, csv_to_rows, get_clipboard_csv, get_clipboard_rtf, rows_to_csv,
@@ -1774,6 +1778,7 @@ def start_autocontrol_gui(*args, **kwargs):
     "grade_contrast", "dominant_pair", "region_contrast",
     "normalize_theme", "match_theme",
     "localize_changes", "rank_changes",
+    "classify_widget", "box_features", "classify_icon",
     "build_rtf", "rtf_to_text", "rows_to_csv", "csv_to_rows",
     "set_clipboard_rtf", "get_clipboard_rtf",
     "set_clipboard_csv", "get_clipboard_csv",
diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py
index ac0cf971..f642c163 100644
--- a/je_auto_control/gui/script_builder/command_schema.py
+++ b/je_auto_control/gui/script_builder/command_schema.py
@@ -4632,6 +4632,22 @@ def _add_work_queue_specs(specs: List[CommandSpec]) -> None:
         ),
         description="Rank which element boxes changed between two frames.",
     ))
+    specs.append(CommandSpec(
+        "AC_classify_widget", "Image", "Classify Widget (features)",
+        fields=(
+            FieldSpec("features", FieldType.STRING,
+                      placeholder="JSON {aspect, circularity, fill}"),
+        ),
+        description="Map geometric features to a widget type.",
+    ))
+    specs.append(CommandSpec(
+        "AC_classify_icon", "Image", "Classify Icon (box)",
+        fields=(
+            FieldSpec("source", FieldType.STRING, placeholder="image path"),
+            FieldSpec("box", FieldType.STRING, placeholder="[x, y, w, h]"),
+        ),
+        description="Classify the widget in an image box from its pixels.",
+    ))
     specs.append(CommandSpec(
         "AC_normalize_ext", "Shell", "Normalize Extension",
         fields=(
diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py
index fc271047..62168052 100644
--- a/je_auto_control/utils/executor/action_executor.py
+++ b/je_auto_control/utils/executor/action_executor.py
@@ -2932,6 +2932,20 @@ def _localize_changes(reference: Any, boxes: Any, current: Any = None,
     return {"changes": changes}
 
 
+def _classify_widget(features: Any) -> Dict[str, Any]:
+    """Adapter: ``features`` (dict or JSON text) -> ``{type}`` (pure)."""
+    from je_auto_control.utils.icon_classify import classify_widget
+    import json
+    data = json.loads(features) if isinstance(features, str) else dict(features)
+    return {"type": classify_widget(data)}
+
+
+def _classify_icon(source: Any, box: Any) -> Dict[str, Any]:
+    """Adapter: classify ``box`` of ``str(source)`` from its pixels (device)."""
+    from je_auto_control.utils.icon_classify import classify_icon
+    return classify_icon(str(source), _coerce_list(box))
+
+
 def _normalize_ext(target: str) -> Dict[str, Any]:
     """Adapter: the lowercased extension of a path / bare ext (pure)."""
     from je_auto_control.utils.file_assoc import normalize_ext
@@ -6973,6 +6987,8 @@ def __init__(self):
             "AC_match_theme": _match_theme,
             "AC_rank_changes": _rank_changes,
             "AC_localize_changes": _localize_changes,
+            "AC_classify_widget": _classify_widget,
+            "AC_classify_icon": _classify_icon,
             "AC_normalize_ext": _normalize_ext,
             "AC_file_association": _file_association,
             "AC_get_control_text": _get_control_text,
diff --git a/je_auto_control/utils/icon_classify/__init__.py b/je_auto_control/utils/icon_classify/__init__.py
new file mode 100644
index 00000000..f7c35564
--- /dev/null
+++ b/je_auto_control/utils/icon_classify/__init__.py
@@ -0,0 +1,6 @@
+"""Classify what kind of widget a box is from its pixel shape."""
+from je_auto_control.utils.icon_classify.icon_classify import (
+    WIDGET_TYPES, box_features, classify_icon, classify_widget,
+)
+
+__all__ = ["classify_widget", "box_features", "classify_icon", "WIDGET_TYPES"]
diff --git a/je_auto_control/utils/icon_classify/icon_classify.py b/je_auto_control/utils/icon_classify/icon_classify.py
new file mode 100644
index 00000000..edd11c23
--- /dev/null
+++ b/je_auto_control/utils/icon_classify/icon_classify.py
@@ -0,0 +1,107 @@
+"""Classify what kind of widget a box is from its pixel shape.
+
+Set-of-Marks and element proposers hand back *boxes*, but not *what each box is*.
+``form_fields.checkbox_state`` already reads a box known to be a checkbox; the
+gap is the typing step before it — is this box a checkbox, a radio button, a
+push button, a text field or a toggle? ``icon_classify`` answers that from cheap
+geometric features (no model):
+
+* :func:`box_features` — extract ``{aspect, fill, edge_density, circularity}``
+  for a box region (the objective measurements).
+* :func:`classify_widget` — pure: map a feature dict to a widget type by
+  documented heuristics.
+* :func:`classify_icon` — compose the two: a box to ``{type, features}``.
+
+``classify_widget`` is pure and fully testable; ``box_features`` imports cv2 /
+numpy lazily (the module stays importable without them) and reuses
+:func:`visual_match._to_gray`. Imports no ``PySide6``.
+"""
+from typing import Any, Dict, Sequence
+
+# The widget types this classifier can return.
+WIDGET_TYPES = ("radio", "toggle", "checkbox", "text_field", "button", "icon")
+
+
+def _is_round(aspect: float, circ: float) -> bool:
+    """Near-square and circular (a radio button / round dot)."""
+    return 0.7 <= aspect <= 1.4 and circ >= 0.7
+
+
+def _is_pill(aspect: float, circ: float) -> bool:
+    """Wide and rounded (a toggle switch)."""
+    return 1.8 <= aspect <= 3.5 and circ >= 0.55
+
+
+def classify_widget(features: Dict[str, float]) -> str:
+    """Map geometric ``features`` to a widget type by heuristics (pure).
+
+    Missing keys default to ``aspect`` 1.0 (w/h), ``circularity`` 0.0 (1 =
+    circle), ``fill`` 0.0 (ink fraction). First match wins: round → ``radio``;
+    pill → ``toggle``; near-square & sparse → ``checkbox``; wide & hollow →
+    ``text_field``; wide & filled → ``button``; otherwise ``icon``.
+    """
+    aspect = float(features.get("aspect", 1.0))
+    circ = float(features.get("circularity", 0.0))
+    fill = float(features.get("fill", 0.0))
+    if _is_round(aspect, circ):
+        return "radio"
+    if _is_pill(aspect, circ):
+        return "toggle"
+    if 0.7 <= aspect <= 1.4 and fill <= 0.6:
+        return "checkbox"
+    if aspect >= 2.5 and fill <= 0.2:
+        return "text_field"
+    if aspect >= 1.5 and fill >= 0.2:
+        return "button"
+    return "icon"
+
+
+def _circularity(binary: Any) -> float:
+    """Circularity (``4*pi*A / P^2``, 1 = circle) of the largest blob."""
+    import math
+
+    import cv2
+    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL,
+                                   cv2.CHAIN_APPROX_SIMPLE)
+    if not contours:
+        return 0.0
+    largest = max(contours, key=cv2.contourArea)
+    area = float(cv2.contourArea(largest))
+    perimeter = float(cv2.arcLength(largest, True))
+    if perimeter <= 0.0:
+        return 0.0
+    return min(1.0, 4.0 * math.pi * area / (perimeter * perimeter))
+
+
+def box_features(source: Any, box: Sequence[int]) -> Dict[str, float]:
+    """Extract ``{aspect, fill, edge_density, circularity}`` for a box (cv2).
+
+    ``aspect`` is w/h, ``fill`` the Otsu ink fraction, ``edge_density`` the
+    Canny-edge fraction, ``circularity`` the largest blob's roundness. Bounds
+    are clamped at 0 (no negative-index wraparound); an empty box yields zeros.
+    """
+    import cv2
+    from je_auto_control.utils.visual_match.visual_match import _to_gray
+    gray = _to_gray(source)
+    x, y, w, h = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
+    patch = gray[max(0, y):max(0, y + h), max(0, x):max(0, x + w)]
+    if patch.size == 0:
+        return {"aspect": 0.0, "fill": 0.0, "edge_density": 0.0,
+                "circularity": 0.0}
+    _, binary = cv2.threshold(patch, 0, 255,
+                              cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+    fill = float((binary > 0).sum()) / patch.size
+    edges = cv2.Canny(patch, 50, 150)
+    edge_density = float((edges > 0).sum()) / patch.size
+    return {
+        "aspect": round(w / h, 3) if h else 0.0,
+        "fill": round(fill, 3),
+        "edge_density": round(edge_density, 3),
+        "circularity": round(_circularity(binary), 3),
+    }
+
+
+def classify_icon(source: Any, box: Sequence[int]) -> Dict[str, Any]:
+    """Classify the widget in a box from its pixels: ``{type, features}``."""
+    features = box_features(source, box)
+    return {"type": classify_widget(features), "features": features}
diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py
index 41e8400d..3b11e843 100644
--- a/je_auto_control/utils/mcp_server/tools/_factories.py
+++ b/je_auto_control/utils/mcp_server/tools/_factories.py
@@ -4137,6 +4137,28 @@ def img_histogram_tools() -> List[MCPTool]:
             handler=h.localize_changes,
             annotations=READ_ONLY,
         ),
+        MCPTool(
+            name="ac_classify_widget",
+            description=("Map geometric 'features' {aspect, circularity, fill} "
+                         "to a widget type (radio/toggle/checkbox/text_field/"
+                         "button/icon). Pure. Returns {type}."),
+            input_schema=schema({"features": {"type": "object"}},
+                                required=["features"]),
+            handler=h.classify_widget,
+            annotations=READ_ONLY,
+        ),
+        MCPTool(
+            name="ac_classify_icon",
+            description=("Classify the widget in a 'box' [x,y,w,h] of a "
+                         "'source' image from its pixel shape. Returns {type, "
+                         "features}."),
+            input_schema=schema({"source": {"type": "string"},
+                                 "box": {"type": "array",
+                                        "items": {"type": "integer"}}},
+                                required=["source", "box"]),
+            handler=h.classify_icon,
+            annotations=READ_ONLY,
+        ),
     ]
 
 
diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py
index a91d0644..66875681 100644
--- a/je_auto_control/utils/mcp_server/tools/_handlers.py
+++ b/je_auto_control/utils/mcp_server/tools/_handlers.py
@@ -787,6 +787,16 @@ def localize_changes(reference, boxes, current=None, threshold=0.1,
     return _localize_changes(reference, boxes, current, threshold, region)
 
 
+def classify_widget(features):
+    from je_auto_control.utils.executor.action_executor import _classify_widget
+    return _classify_widget(features)
+
+
+def classify_icon(source, box):
+    from je_auto_control.utils.executor.action_executor import _classify_icon
+    return _classify_icon(source, box)
+
+
 def normalize_ext(target):
     from je_auto_control.utils.executor.action_executor import _normalize_ext
     return _normalize_ext(target)
diff --git a/test/unit_test/headless/test_icon_classify_batch.py b/test/unit_test/headless/test_icon_classify_batch.py
new file mode 100644
index 00000000..63f65444
--- /dev/null
+++ b/test/unit_test/headless/test_icon_classify_batch.py
@@ -0,0 +1,107 @@
+"""Headless tests for icon_classify (pure classifier + cv2 features)."""
+import pytest
+
+import je_auto_control as ac
+from je_auto_control.utils.icon_classify import (
+    box_features, classify_icon, classify_widget,
+)
+
+
+# --- pure classify_widget -------------------------------------------------
+
+def test_classify_radio_round():
+    assert classify_widget(
+        {"aspect": 1.0, "circularity": 0.92, "fill": 0.4}) == "radio"
+
+
+def test_classify_toggle_wide_rounded():
+    assert classify_widget(
+        {"aspect": 2.4, "circularity": 0.6, "fill": 0.5}) == "toggle"
+
+
+def test_classify_checkbox_square_sparse():
+    assert classify_widget(
+        {"aspect": 1.0, "circularity": 0.2, "fill": 0.1}) == "checkbox"
+
+
+def test_classify_text_field_wide_hollow():
+    assert classify_widget(
+        {"aspect": 4.0, "circularity": 0.1, "fill": 0.05}) == "text_field"
+
+
+def test_classify_button_wide_filled():
+    assert classify_widget(
+        {"aspect": 2.0, "circularity": 0.2, "fill": 0.5}) == "button"
+
+
+def test_classify_icon_fallback():
+    assert classify_widget(
+        {"aspect": 1.1, "circularity": 0.3, "fill": 0.9}) == "icon"
+
+
+def test_classify_widget_defaults_dont_crash():
+    assert classify_widget({}) in ("checkbox", "icon")
+
+
+# --- cv2 box_features / classify_icon (per-function importorskip) ----------
+
+def test_box_features_circle_rounder_than_square():
+    np = pytest.importorskip("numpy")
+    cv2 = pytest.importorskip("cv2")
+    canvas = np.full((40, 40), 255, dtype="uint8")
+    cv2.circle(canvas, (20, 20), 14, 0, -1)
+    circle = box_features(canvas, [3, 3, 34, 34])
+    square_canvas = np.full((40, 40), 255, dtype="uint8")
+    cv2.rectangle(square_canvas, (6, 6), (34, 34), 0, -1)
+    square = box_features(square_canvas, [3, 3, 34, 34])
+    assert circle["circularity"] > square["circularity"]
+    assert circle["circularity"] > 0.8
+
+
+def test_classify_icon_detects_radio_from_pixels():
+    np = pytest.importorskip("numpy")
+    cv2 = pytest.importorskip("cv2")
+    canvas = np.full((40, 40), 255, dtype="uint8")
+    cv2.circle(canvas, (20, 20), 13, 0, -1)   # filled round dot
+    result = classify_icon(canvas, [4, 4, 32, 32])
+    assert result["type"] == "radio"
+    assert set(result["features"]) == {"aspect", "fill", "edge_density",
+                                       "circularity"}
+
+
+def test_box_features_empty_box():
+    np = pytest.importorskip("numpy")
+    pytest.importorskip("cv2")
+    # A zero-sized box must short-circuit to the all-zero feature dict.
+    canvas = np.zeros((10, 10), dtype="uint8")
+    feats = box_features(canvas, [0, 0, 0, 0])
+    assert feats == {"aspect": 0.0, "fill": 0.0, "edge_density": 0.0,
+                     "circularity": 0.0}
+
+
+# --- wiring (cv2-free) ----------------------------------------------------
+
+def test_executor_pure_classify_path():
+    from je_auto_control.utils.executor.action_executor import (
+        _classify_widget,
+    )
+    out = _classify_widget('{"aspect": 1.0, "circularity": 0.9, "fill": 0.3}')
+    assert out["type"] == "radio"
+
+
+def test_wiring():
+    known = set(ac.executor.known_commands())
+    assert {"AC_classify_widget", "AC_classify_icon"} <= known
+    from je_auto_control.utils.mcp_server.tools import (
+        build_default_tool_registry,
+    )
+    names = {t.name for t in build_default_tool_registry()}
+    assert {"ac_classify_widget", "ac_classify_icon"} <= names
+    from je_auto_control.gui.script_builder.command_schema import _build_specs
+    specs = {s.command for s in _build_specs()}
+    assert {"AC_classify_widget", "AC_classify_icon"} <= specs
+
+
+def test_facade_exports():
+    for name in ("classify_widget", "box_features", "classify_icon"):
+        assert hasattr(ac, name) and name in ac.__all__

From 01ce41eee5ca0f294c9954e2a054471087b06286 Mon Sep 17 00:00:00 2001
From: JeffreyChen <zenxcvwait@gmail.com>
Date: Fri, 26 Jun 2026 10:23:11 +0800
Subject: [PATCH 3/3] Add element_proposal: propose a clean element list from
 raw pixels

Set-of-Marks/observation/grounding assume you already have element boxes,
but a game/custom-drawn app/remote desktop has no accessibility tree.
propose_elements builds the top-of-funnel list from pixels: widget blobs
(Canny+morphology+connected_boxes) + text regions, fused via element_parse
(ocr>icon priority is the drop-widget-that-is-really-text cross-check),
reading-ordered and tagged text/widget. tag_kinds is the pure labeller.
---
 WHATS_NEW.md                                  |  6 ++
 .../doc/new_features/v220_features_doc.rst    | 51 +++++++++++
 .../Zh/doc/new_features/v220_features_doc.rst | 42 +++++++++
 je_auto_control/__init__.py                   |  3 +
 .../gui/script_builder/command_schema.py      | 19 +++++
 .../utils/element_proposal/__init__.py        |  6 ++
 .../element_proposal/element_proposal.py      | 80 +++++++++++++++++
 .../utils/executor/action_executor.py         | 19 +++++
 .../utils/mcp_server/tools/_factories.py      | 25 ++++++
 .../utils/mcp_server/tools/_handlers.py       | 12 +++
 .../headless/test_element_proposal_batch.py   | 85 +++++++++++++++++++
 11 files changed, 348 insertions(+)
 create mode 100644 docs/source/Eng/doc/new_features/v220_features_doc.rst
 create mode 100644 docs/source/Zh/doc/new_features/v220_features_doc.rst
 create mode 100644 je_auto_control/utils/element_proposal/__init__.py
 create mode 100644 je_auto_control/utils/element_proposal/element_proposal.py
 create mode 100644 test/unit_test/headless/test_element_proposal_batch.py

diff --git a/WHATS_NEW.md b/WHATS_NEW.md
index 83c0dbe3..18ba3291 100644
--- a/WHATS_NEW.md
+++ b/WHATS_NEW.md
@@ -2,6 +2,12 @@
 
 ## What's new (2026-06-26)
 
+### Template-Free Element Proposal (Pixels to Elements)
+
+Get a clean numbered element list straight from the screen when there's no accessibility tree. Full reference: [`docs/source/Eng/doc/new_features/v220_features_doc.rst`](docs/source/Eng/doc/new_features/v220_features_doc.rst).
+
+- **`propose_elements` / `tag_kinds`** (`AC_propose_elements`, `AC_tag_kinds`): Set-of-Marks, `observation` and the grounding helpers all assume you already have element boxes — but a game, a custom-drawn app or a remote desktop has no accessibility tree. `propose_elements` builds that top-of-funnel list from pixels: detect widget boxes (closed-edge blobs via Canny + morphology + `connected_boxes`) and text boxes (`text_regions.find_text_regions`), fuse them — the `element_parse` `ocr > icon` priority *is* the "drop widget-that-is-really-text" cross-check — and return them in reading order, each tagged `text` or `widget`. `tag_kinds` is the pure labeller. cv2 imported lazily; the labeller is fully testable. Seventh and final feature of the ROUND-15 perception lane. No `PySide6`.
+
 ### Classify a Widget from Its Pixel Shape
 
 Tell a checkbox from a radio button from a text field — from pixels, no model. Full reference: [`docs/source/Eng/doc/new_features/v219_features_doc.rst`](docs/source/Eng/doc/new_features/v219_features_doc.rst).
diff --git a/docs/source/Eng/doc/new_features/v220_features_doc.rst b/docs/source/Eng/doc/new_features/v220_features_doc.rst
new file mode 100644
index 00000000..17956790
--- /dev/null
+++ b/docs/source/Eng/doc/new_features/v220_features_doc.rst
@@ -0,0 +1,51 @@
+Template-Free Element Proposal (Pixels to Elements)
+===================================================
+
+Set-of-Marks, ``observation`` and the grounding helpers all assume you already
+have a list of element boxes — but on a screen the framework doesn't model
+(a game, a custom-drawn app, a remote desktop) there is no accessibility tree to
+provide one. ``element_proposal`` builds that top-of-funnel list from pixels:
+detect candidate *widget* boxes (closed-edge blobs) and *text* boxes
+(:func:`text_regions.find_text_regions`), fuse them — dropping widget boxes that
+are really just text — and return them in reading order, each tagged ``text`` or
+``widget``.
+
+* :func:`propose_elements` — the full pixel-to-elements pipeline.
+* :func:`tag_kinds` — pure: label fused boxes ``text`` / ``widget`` by source and
+  keep their reading-order ``index``.
+
+The fusion / cross-check / ordering reuse :mod:`element_parse` — the ``ocr`` >
+``icon`` source priority *is* the "drop widget-that-is-really-text" check — and
+the text detection reuses :mod:`text_regions`. ``cv2`` is imported lazily so the
+module stays importable; :func:`tag_kinds` is pure and fully testable. Imports no
+``PySide6``.
+
+Headless API
+------------
+
+.. code-block:: python
+
+    from je_auto_control import propose_elements, mark_elements
+
+    # No accessibility tree? Propose elements straight from the screen:
+    elements = propose_elements(min_area=120)
+    # [{'box': [x, y, w, h], 'kind': 'widget', 'index': 0}, ...]
+
+    # Feed them to Set-of-Marks like any other element list:
+    marks = mark_elements(elements)
+
+``propose_elements`` returns ``[{box, kind, index}]`` in reading order, where
+``kind`` is ``text`` or ``widget``. It is the missing top-of-funnel for the
+agent stack on un-modelled UIs: pixels in, a clean numbered element list out,
+ready for marking, observation or grounding. Tune ``min_area`` for the smallest
+control you care about and ``iou_threshold`` for how aggressively overlapping
+text and widget boxes are merged.
+
+Executor commands
+-----------------
+
+``AC_propose_elements`` (``region`` ``[x, y, w, h]`` / ``min_area`` /
+``iou_threshold`` → ``{elements}``) runs the full pipeline on the screen, and
+``AC_tag_kinds`` (``elements`` JSON list → ``{elements}``, pure) labels a
+pre-fused list. Both are also exposed as matching read-only ``ac_*`` MCP tools
+and as Script Builder commands under **Image**.
diff --git a/docs/source/Zh/doc/new_features/v220_features_doc.rst b/docs/source/Zh/doc/new_features/v220_features_doc.rst
new file mode 100644
index 00000000..45554aff
--- /dev/null
+++ b/docs/source/Zh/doc/new_features/v220_features_doc.rst
@@ -0,0 +1,42 @@
+免模板元素提案(像素到元素)
+============================
+
+Set-of-Marks、``observation`` 與 grounding 輔助函式都假設你已有一份元素方框清單——但在框架無法
+建模的畫面上(遊戲、自繪 app、遠端桌面),並沒有無障礙樹可提供。``element_proposal`` 從像素建立
+這份漏斗頂端清單:偵測候選*控制項*方框(封閉邊緣 blob)與*文字*方框
+(:func:`text_regions.find_text_regions`),將兩者融合——丟棄其實只是文字的控制項方框——
+並依閱讀順序回傳,每個標記為 ``text`` 或 ``widget``。
+
+* :func:`propose_elements` ——完整的像素到元素管線。
+* :func:`tag_kinds` ——純函式:依來源把融合後的方框標記 ``text`` / ``widget``,並保留其閱讀順序 ``index``。
+
+融合 / 交叉檢查 / 排序重用 :mod:`element_parse`——``ocr`` > ``icon`` 來源優先序*即*「丟棄其實是
+文字的控制項」檢查——文字偵測則重用 :mod:`text_regions`。``cv2`` 採延遲匯入,故模組仍可匯入;
+:func:`tag_kinds` 為純函式且可完整測試。不匯入 ``PySide6``。
+
+無頭 API
+--------
+
+.. code-block:: python
+
+    from je_auto_control import propose_elements, mark_elements
+
+    # 沒有無障礙樹?直接從畫面提案元素:
+    elements = propose_elements(min_area=120)
+    # [{'box': [x, y, w, h], 'kind': 'widget', 'index': 0}, ...]
+
+    # 像任何元素清單一樣餵給 Set-of-Marks:
+    marks = mark_elements(elements)
+
+``propose_elements`` 依閱讀順序回傳 ``[{box, kind, index}]``,``kind`` 為 ``text`` 或 ``widget``。
+它是 agent 堆疊在未建模 UI 上缺少的漏斗頂端:像素進、乾淨的編號元素清單出,可供標記、observation
+或 grounding。以 ``min_area`` 調整你在意的最小控制項,以 ``iou_threshold`` 調整重疊文字與控制項
+方框合併的積極程度。
+
+執行器指令
+----------
+
+``AC_propose_elements``(``region`` ``[x, y, w, h]`` / ``min_area`` /
+``iou_threshold`` → ``{elements}``)在畫面上執行完整管線,``AC_tag_kinds``
+(``elements`` JSON 清單 → ``{elements}``,純函式)則標記預先融合的清單。皆以對應的唯讀
+``ac_*`` MCP 工具及 Script Builder 指令(位於 **Image** 分類下)形式提供。
diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py
index 54b3f178..1ccf1d6d 100644
--- a/je_auto_control/__init__.py
+++ b/je_auto_control/__init__.py
@@ -149,6 +149,8 @@
 from je_auto_control.utils.icon_classify import (
     box_features, classify_icon, classify_widget,
 )
+# Propose a clean element list from raw pixels (template-free)
+from je_auto_control.utils.element_proposal import propose_elements, tag_kinds
 # Rich clipboard formats — RTF + CSV/TSV codecs and Windows get / set
 from je_auto_control.utils.clipboard_rich_formats import (
     build_rtf, csv_to_rows, get_clipboard_csv, get_clipboard_rtf, rows_to_csv,
@@ -1779,6 +1781,7 @@ def start_autocontrol_gui(*args, **kwargs):
     "normalize_theme", "match_theme",
     "localize_changes", "rank_changes",
     "classify_widget", "box_features", "classify_icon",
+    "propose_elements", "tag_kinds",
     "build_rtf", "rtf_to_text", "rows_to_csv", "csv_to_rows",
     "set_clipboard_rtf", "get_clipboard_rtf",
     "set_clipboard_csv", "get_clipboard_csv",
diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py
index f642c163..50e2031b 100644
--- a/je_auto_control/gui/script_builder/command_schema.py
+++ b/je_auto_control/gui/script_builder/command_schema.py
@@ -4648,6 +4648,25 @@ def _add_work_queue_specs(specs: List[CommandSpec]) -> None:
         ),
         description="Classify the widget in an image box from its pixels.",
     ))
+    specs.append(CommandSpec(
+        "AC_propose_elements", "Image", "Propose Elements (template-free)",
+        fields=(
+            FieldSpec("region", FieldType.STRING, optional=True,
+                      placeholder="[x, y, w, h]"),
+            FieldSpec("min_area", FieldType.INT, optional=True, default=80),
+            FieldSpec("iou_threshold", FieldType.FLOAT, optional=True,
+                      default=0.5),
+        ),
+        description="Propose text/widget element boxes from raw screen pixels.",
+    ))
+    specs.append(CommandSpec(
+        "AC_tag_kinds", "Image", "Tag Element Kinds",
+        fields=(
+            FieldSpec("elements", FieldType.STRING,
+                      placeholder="JSON list of fused boxes"),
+        ),
+        description="Label fused element boxes text/widget by source.",
+    ))
     specs.append(CommandSpec(
         "AC_normalize_ext", "Shell", "Normalize Extension",
         fields=(
diff --git a/je_auto_control/utils/element_proposal/__init__.py b/je_auto_control/utils/element_proposal/__init__.py
new file mode 100644
index 00000000..103cb9f0
--- /dev/null
+++ b/je_auto_control/utils/element_proposal/__init__.py
@@ -0,0 +1,6 @@
+"""Propose a clean element list from raw pixels, with no template or model."""
+from je_auto_control.utils.element_proposal.element_proposal import (
+    propose_elements, tag_kinds,
+)
+
+__all__ = ["propose_elements", "tag_kinds"]
diff --git a/je_auto_control/utils/element_proposal/element_proposal.py b/je_auto_control/utils/element_proposal/element_proposal.py
new file mode 100644
index 00000000..3b08b274
--- /dev/null
+++ b/je_auto_control/utils/element_proposal/element_proposal.py
@@ -0,0 +1,80 @@
+"""Propose a clean element list from raw pixels, with no template or model.
+
+Set-of-Marks, ``observation`` and the grounding helpers all assume you already
+have a list of element boxes — but on a screen the framework doesn't model
+(a game, a custom-drawn app, a remote desktop) there is no accessibility tree to
+provide one. ``element_proposal`` builds that top-of-funnel list from pixels:
+detect candidate *widget* boxes (closed-edge blobs) and *text* boxes
+(:func:`text_regions.find_text_regions`), fuse them — dropping widget boxes that
+are really just text — and return them in reading order, each tagged ``text`` or
+``widget``.
+
+* :func:`propose_elements` — the full pixel-to-elements pipeline.
+* :func:`tag_kinds` — pure: label fused boxes ``text`` / ``widget`` by source and
+  keep their reading-order ``index``.
+
+The fusion / cross-check / ordering reuse :mod:`element_parse` (the ``ocr`` >
+``icon`` priority *is* the "drop widget-that-is-really-text" check) and
+:mod:`text_regions`; ``cv2`` is imported lazily so the module stays importable.
+:func:`tag_kinds` is pure and fully testable. Imports no ``PySide6``.
+"""
+from typing import Any, Dict, List, Optional, Sequence
+
+# Maps a fused box's ``source`` tag to the element ``kind`` reported for it.
+_KIND_BY_SOURCE = {"ocr": "text", "icon": "widget", "a11y": "element"}
+
+
+def tag_kinds(elements: Sequence[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """Label fused boxes ``text`` / ``widget`` by source (pure).
+
+    ``source`` picks the kind: ``ocr`` → text, ``a11y`` → element, else widget.
+    ``index`` (reading order) is kept. Returns ``[{box, kind, index}]``.
+    """
+    result: List[Dict[str, Any]] = []
+    for element in elements:
+        box = [int(element["x"]), int(element["y"]),
+               int(element["width"]), int(element["height"])]
+        kind = _KIND_BY_SOURCE.get(element.get("source"), "widget")
+        result.append({"box": box, "kind": kind, "index": element.get("index")})
+    return result
+
+
+def _reasonable(box: Dict[str, Any], frame_w: int, frame_h: int) -> bool:
+    """Keep plausibly-widget blobs: not the whole frame, not a thin rule."""
+    width, height = int(box["width"]), int(box["height"])
+    if width >= 0.95 * frame_w and height >= 0.95 * frame_h:
+        return False
+    aspect = width / height if height else 0.0
+    return 0.05 <= aspect <= 15.0
+
+
+def _widget_boxes(gray: Any, min_area: int) -> List[Dict[str, Any]]:
+    """Detect candidate widget boxes as closed-edge blobs (cv2)."""
+    import cv2
+    from je_auto_control.utils.cv2_utils.blobs import connected_boxes
+    edges = cv2.Canny(gray, 50, 150)
+    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
+    closed = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel)
+    height, width = gray.shape[:2]
+    return [box for box in connected_boxes(closed, min_area=int(min_area))
+            if _reasonable(box, width, height)]
+
+
+def propose_elements(source: Optional[Any] = None, *,
+                     region: Optional[Sequence[int]] = None, min_area: int = 80,
+                     iou_threshold: float = 0.5) -> List[Dict[str, Any]]:
+    """Propose ``text`` / ``widget`` element boxes from pixels, in reading order.
+
+    Runs text and widget detection (both using ``min_area``) on ``source`` (a
+    fresh screen grab of ``region`` by default), fuses them at ``iou_threshold``
+    (text beats widget) and orders them. Returns ``[{box, kind, index}]``.
+    """
+    from je_auto_control.utils.element_parse import fuse_elements, reading_order
+    from je_auto_control.utils.text_regions import find_text_regions
+    from je_auto_control.utils.visual_match.visual_match import _haystack_gray
+    gray = _haystack_gray(source, region)
+    text = find_text_regions(gray, min_area=int(min_area))
+    widgets = _widget_boxes(gray, int(min_area))
+    fused = fuse_elements(ocr_boxes=text, icon_boxes=widgets,
+                          iou_threshold=float(iou_threshold))
+    return tag_kinds(reading_order(fused))
diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py
index 62168052..7f5a8761 100644
--- a/je_auto_control/utils/executor/action_executor.py
+++ b/je_auto_control/utils/executor/action_executor.py
@@ -2946,6 +2946,23 @@ def _classify_icon(source: Any, box: Any) -> Dict[str, Any]:
     return classify_icon(str(source), _coerce_list(box))
 
 
+def _propose_elements(region: Any = None, min_area: Any = 80,
+                      iou_threshold: Any = 0.5) -> Dict[str, Any]:
+    """Adapter: propose text/widget element boxes from pixels (device)."""
+    from je_auto_control.utils.element_proposal import propose_elements
+    elements = propose_elements(region=_coerce_region(region),
+                                min_area=int(min_area),
+                                iou_threshold=float(iou_threshold))
+    return {"elements": elements}
+
+
+def _tag_kinds(elements: Any) -> Dict[str, Any]:
+    """Adapter: label fused boxes by source; falsy input gives [] (pure)."""
+    from je_auto_control.utils.element_proposal import tag_kinds
+    items = _coerce_list(elements) if elements else []
+    return {"elements": tag_kinds(items)}
+
+
 def _normalize_ext(target: str) -> Dict[str, Any]:
     """Adapter: the lowercased extension of a path / bare ext (pure)."""
     from je_auto_control.utils.file_assoc import normalize_ext
@@ -6989,6 +7006,8 @@ def __init__(self):
             "AC_localize_changes": _localize_changes,
             "AC_classify_widget": _classify_widget,
             "AC_classify_icon": _classify_icon,
+            "AC_propose_elements": _propose_elements,
+            "AC_tag_kinds": _tag_kinds,
             "AC_normalize_ext": _normalize_ext,
             "AC_file_association": _file_association,
             "AC_get_control_text": _get_control_text,
diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py
index 3b11e843..26c2427e 100644
--- a/je_auto_control/utils/mcp_server/tools/_factories.py
+++ b/je_auto_control/utils/mcp_server/tools/_factories.py
@@ -4159,6 +4159,31 @@ def img_histogram_tools() -> List[MCPTool]:
             handler=h.classify_icon,
             annotations=READ_ONLY,
         ),
+        MCPTool(
+            name="ac_propose_elements",
+            description=("Propose text/widget element boxes from raw screen "
+                         "pixels (template-free): detect widget blobs + text "
+                         "regions, fuse, order. 'region' [x,y,w,h] clips. "
+                         "Returns {elements:[{box, kind, index}]}."),
+            input_schema=schema({"region": {"type": "array",
+                                           "items": {"type": "integer"}},
+                                 "min_area": {"type": "integer"},
+                                 "iou_threshold": {"type": "number"}}),
+            handler=h.propose_elements,
+            annotations=READ_ONLY,
+        ),
+        MCPTool(
+            name="ac_tag_kinds",
+            description=("Label fused element boxes 'text'/'widget' by source "
+                         "(pure). 'elements' is a list of {x,y,width,height,"
+                         "source,index}. Returns {elements:[{box, kind, "
+                         "index}]}."),
+            input_schema=schema({"elements": {"type": "array",
+                                             "items": {"type": "object"}}},
+                                required=["elements"]),
+            handler=h.tag_kinds,
+            annotations=READ_ONLY,
+        ),
     ]
 
 
diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py
index 66875681..91d38a68 100644
--- a/je_auto_control/utils/mcp_server/tools/_handlers.py
+++ b/je_auto_control/utils/mcp_server/tools/_handlers.py
@@ -797,6 +797,18 @@ def classify_icon(source, box):
     return _classify_icon(source, box)
 
 
+def propose_elements(region=None, min_area=80, iou_threshold=0.5):
+    """MCP handler: delegate to the executor adapter, arguments unchanged."""
+    from je_auto_control.utils.executor.action_executor import (
+        _propose_elements as adapter)
+    return adapter(region, min_area, iou_threshold)
+
+
+def tag_kinds(elements):
+    from je_auto_control.utils.executor.action_executor import _tag_kinds
+    return _tag_kinds(elements)  # coercion + wrapping happen in the adapter
+
+
 def normalize_ext(target):
     from je_auto_control.utils.executor.action_executor import _normalize_ext
     return _normalize_ext(target)
diff --git a/test/unit_test/headless/test_element_proposal_batch.py b/test/unit_test/headless/test_element_proposal_batch.py
new file mode 100644
index 00000000..c88fe950
--- /dev/null
+++ b/test/unit_test/headless/test_element_proposal_batch.py
@@ -0,0 +1,85 @@
+"""Headless tests for element_proposal (pure tag_kinds + cv2 pipeline)."""
+import pytest
+
+import je_auto_control as ac
+from je_auto_control.utils.element_proposal import propose_elements, tag_kinds
+
+
+# --- pure tag_kinds -------------------------------------------------------
+
+def test_tag_kinds_labels_by_source():
+    """An 'ocr' source is tagged text; an 'icon' source is tagged widget."""
+    ocr_box = {"x": 0, "y": 0, "width": 30, "height": 12,
+               "source": "ocr", "index": 0}
+    icon_box = {"x": 0, "y": 20, "width": 16, "height": 16,
+                "source": "icon", "index": 1}
+    tagged = tag_kinds([ocr_box, icon_box])
+    assert tagged[0] == {"box": [0, 0, 30, 12], "kind": "text", "index": 0}
+    expected_widget = {"box": [0, 20, 16, 16], "kind": "widget", "index": 1}
+    assert tagged[1] == expected_widget
+
+
+def test_tag_kinds_unknown_source_is_widget():
+    """A box carrying no 'source' key is tagged as a widget."""
+    first = tag_kinds([{"x": 1, "y": 2, "width": 3, "height": 4}])[0]
+    assert first["kind"] == "widget" and first["box"] == [1, 2, 3, 4]
+
+
+def test_tag_kinds_empty():
+    assert tag_kinds([]) == []  # empty input yields an empty list
+
+
+# --- cv2 propose_elements (per-function importorskip) ---------------------
+
+def test_propose_elements_finds_widgets():
+    """Three outlined rectangles yield >= 2 well-formed, indexed elements."""
+    np = pytest.importorskip("numpy")
+    cv2 = pytest.importorskip("cv2")
+    frame = np.full((200, 240), 245, dtype="uint8")  # 240 wide x 200 tall
+    outlines = [((20, 20), (90, 60)), ((130, 30), (210, 70)),
+                ((40, 110), (200, 160))]
+    for top_left, bottom_right in outlines:
+        cv2.rectangle(frame, top_left, bottom_right, 0, 2)
+    found = propose_elements(frame, min_area=120)
+    assert len(found) >= 2
+    # each element has exactly the three keys and 'index' == list position
+    for expected_index, item in enumerate(found):
+        assert set(item) == {"box", "kind", "index"}
+        assert item["index"] == expected_index
+        assert item["kind"] in ("text", "widget")
+        assert len(item["box"]) == 4
+    # no box is frame-sized: width >= 228 together with height >= 190
+    assert not any(e["box"][2] >= 228 and e["box"][3] >= 190 for e in found)
+
+
+def test_propose_elements_blank_screen_is_empty_or_small():
+    """Despite the name, a uniform frame must yield exactly no elements."""
+    np, _cv2 = pytest.importorskip("numpy"), pytest.importorskip("cv2")
+    white = np.full((120, 120), 255, dtype="uint8")
+    assert propose_elements(white, min_area=200) == []
+
+
+# --- wiring (cv2-free) ----------------------------------------------------
+
+def test_executor_pure_tag_path():
+    from je_auto_control.utils.executor.action_executor import _tag_kinds
+    json_text = '[{"x":0,"y":0,"width":10,"height":10,"source":"icon"}]'
+    assert _tag_kinds(json_text)["elements"][0]["kind"] == "widget"
+
+
+def test_wiring():
+    """New commands are known to executor, MCP registry and GUI specs."""
+    commands = {"AC_propose_elements", "AC_tag_kinds"}
+    assert commands <= set(ac.executor.known_commands())
+    from je_auto_control.utils.mcp_server.tools import (
+        build_default_tool_registry as build_registry)
+    tool_names = {tool.name for tool in build_registry()}
+    assert {"ac_propose_elements", "ac_tag_kinds"} <= tool_names
+    from je_auto_control.gui.script_builder.command_schema import _build_specs
+    spec_commands = {spec.command for spec in _build_specs()}
+    assert commands <= spec_commands
+
+
+def test_facade_exports():
+    for name in ("propose_elements", "tag_kinds"):  # package-level names
+        assert hasattr(ac, name) and name in ac.__all__  # attr + __all__