From 6c826a4976681e61bd4934425c190d7ce42eb242 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Fri, 26 Jun 2026 09:49:25 +0800 Subject: [PATCH 1/3] Add change_localize: attribute a screen change to the element boxes that changed Existing diffs return raw pixel regions or a11y-element diffs; the gap is 'given a frame diff and a list of element boxes, which of those changed?'. localize_changes diffs reference vs current and scores each element box by its mean per-pixel change; rank_changes is the pure ranker (changed when score >= threshold, sorted most-changed first). cv2/numpy lazy. --- WHATS_NEW.md | 6 ++ .../doc/new_features/v218_features_doc.rst | 50 ++++++++++++ .../Zh/doc/new_features/v218_features_doc.rst | 44 +++++++++++ je_auto_control/__init__.py | 3 + .../gui/script_builder/command_schema.py | 26 ++++++ .../utils/change_localize/__init__.py | 6 ++ .../utils/change_localize/change_localize.py | 74 +++++++++++++++++ .../utils/executor/action_executor.py | 22 ++++++ .../utils/mcp_server/tools/_factories.py | 30 +++++++ .../utils/mcp_server/tools/_handlers.py | 13 +++ .../headless/test_change_localize_batch.py | 79 +++++++++++++++++++ 11 files changed, 353 insertions(+) create mode 100644 docs/source/Eng/doc/new_features/v218_features_doc.rst create mode 100644 docs/source/Zh/doc/new_features/v218_features_doc.rst create mode 100644 je_auto_control/utils/change_localize/__init__.py create mode 100644 je_auto_control/utils/change_localize/change_localize.py create mode 100644 test/unit_test/headless/test_change_localize_batch.py diff --git a/WHATS_NEW.md b/WHATS_NEW.md index 25fe4b24..3ebd77c4 100644 --- a/WHATS_NEW.md +++ b/WHATS_NEW.md @@ -2,6 +2,12 @@ ## What's new (2026-06-26) +### Localize a Change to the Elements That Changed + +Turn a raw screen diff into "element 3 changed" by scoring a list of element boxes. Full reference: [`docs/source/Eng/doc/new_features/v218_features_doc.rst`](docs/source/Eng/doc/new_features/v218_features_doc.rst). 
+ +- **`localize_changes` / `rank_changes`** (`AC_localize_changes`, `AC_rank_changes`): existing diffs answer *where* pixels changed (`motion_regions`, `perceptual_diff`, `ssim_changed_regions` → raw pixel regions) or which *accessibility* elements differ (`element_diff`, needs metadata) — but not "given a frame diff **and a list of element boxes**, which of *those* changed?". `localize_changes` diffs a reference against the current screen and scores each supplied element box by its mean per-pixel change; `rank_changes` is the pure ranker that flags `changed` (score ≥ `threshold`) and sorts most-changed first. Pairs with `set_of_marks`/accessibility boxes to give a per-element "what changed" feedback signal after a click. cv2/numpy imported lazily; ranking is pure and fully testable. Fifth feature of the ROUND-15 perception lane. No `PySide6`. + ### Theme-Invariant Matching (Light Template, Dark Mode) Find a button captured in light mode even after the app switches to dark mode. Full reference: [`docs/source/Eng/doc/new_features/v217_features_doc.rst`](docs/source/Eng/doc/new_features/v217_features_doc.rst). diff --git a/docs/source/Eng/doc/new_features/v218_features_doc.rst b/docs/source/Eng/doc/new_features/v218_features_doc.rst new file mode 100644 index 00000000..694ce0dd --- /dev/null +++ b/docs/source/Eng/doc/new_features/v218_features_doc.rst @@ -0,0 +1,50 @@ +Localize a Change to the Elements That Changed +============================================== + +The existing diffs answer "*where* did pixels change" (``motion_regions``, +``perceptual_diff``, ``ssim_changed_regions`` return raw pixel regions) or "which +*accessibility* elements differ" (``element_diff``, needs a11y metadata). The +missing middle is: given a frame diff **and a list of element boxes**, which of +*those* elements changed? ``change_localize`` scores each supplied box by how +much it changed and ranks them. 
+ +* :func:`rank_changes` — pure: take ``[{box, score}]`` and mark each box + ``changed`` (score at or above ``threshold``), sorted most-changed first. +* :func:`localize_changes` — diff a reference against the current screen, score + each element box by its mean pixel change, and rank them. + +``cv2`` / ``numpy`` are imported lazily (the module stays importable without +them) and the loaders reuse :mod:`visual_match`. The ranking is pure and fully +testable. Imports no ``PySide6``. + +Headless API +------------ + +.. code-block:: python + + from je_auto_control import localize_changes, rank_changes, mark_elements + + boxes = [mark["bbox"] for mark in mark_elements(elements)] + + # After an action, which of those elements actually changed? + changed = localize_changes("before.png", boxes, current="after.png") + for entry in changed: + if entry["changed"]: + print("element changed:", entry["box"], entry["score"]) + + # Or rank pre-computed scores yourself: + rank_changes([{"box": [0, 0, 40, 20], "score": 0.6}], threshold=0.1) + +``localize_changes`` returns ``[{box, score, changed}]`` sorted most-changed +first, where ``score`` is the box's mean per-pixel change (0..1). It pairs with +``set_of_marks`` / accessibility element boxes to turn a raw screen diff into a +per-element "what changed" signal — an agent feedback channel after a click. + +Executor commands +----------------- + +``AC_localize_changes`` (``reference`` + ``boxes`` JSON list + ``current`` / +``threshold`` / ``region`` → ``{changes}``) and ``AC_rank_changes`` +(``scored_boxes`` JSON list + ``threshold`` → ``{changes}``, pure). Both are +also available as matching read-only ``ac_*`` MCP tools and as Script Builder +commands under **Image**. 
diff --git a/docs/source/Zh/doc/new_features/v218_features_doc.rst b/docs/source/Zh/doc/new_features/v218_features_doc.rst new file mode 100644 index 00000000..bf69978d --- /dev/null +++ b/docs/source/Zh/doc/new_features/v218_features_doc.rst @@ -0,0 +1,44 @@ +把變化歸因到實際改變的元素 +========================== + +既有的 diff 回答「像素在*哪裡*改變」(``motion_regions``、``perceptual_diff``、 +``ssim_changed_regions`` 回傳原始像素區域),或「哪些*無障礙*元素不同」(``element_diff``,需 a11y 中介資料)。 +缺少的中段是:給定一個畫面 diff **與一份元素方框清單**,*那些*元素中哪些改變了?``change_localize`` 依 +每個提供的方框改變多少評分並排序。 + +* :func:`rank_changes` ——純函式:接受 ``[{box, score}]`` 並把每個方框標記為 ``changed`` + (分數達到或超過 ``threshold``),依改變最多排在最前。 +* :func:`localize_changes` ——把參考影像對目前螢幕做 diff,依每個元素方框的平均像素改變評分,再排序。 + +``cv2`` / ``numpy`` 採延遲匯入(模組無需它們即可匯入),載入器重用 :mod:`visual_match`。 +排序為純函式且可完整測試。不匯入 ``PySide6``。 + +無頭 API +-------- + +.. code-block:: python + + from je_auto_control import localize_changes, rank_changes, mark_elements + + boxes = [mark["bbox"] for mark in mark_elements(elements)] + + # 某動作後,那些元素中哪些真的改變了? 
+ changed = localize_changes("before.png", boxes, current="after.png") + for entry in changed: + if entry["changed"]: + print("元素改變:", entry["box"], entry["score"]) + + # 或自行排序預先算好的分數: + rank_changes([{"box": [0, 0, 40, 20], "score": 0.6}], threshold=0.1) + +``localize_changes`` 回傳 ``[{box, score, changed}]`` 依改變最多排序,``score`` 是方框的平均 +逐像素改變(0..1)。它與 ``set_of_marks`` / 無障礙元素方框搭配,把原始螢幕 diff 轉成逐元素的 +「什麼改變了」訊號——點擊後的 agent 回饋通道。 + +執行器指令 +---------- + +``AC_localize_changes``(``reference`` 加上 ``boxes`` JSON 清單加上 ``current`` / +``threshold`` / ``region`` → ``{changes}``)與 ``AC_rank_changes``(``scored_boxes`` JSON 清單加上 +``threshold`` → ``{changes}``,純函式)。皆以對應的唯讀 ``ac_*`` MCP 工具及 Script Builder 指令 +(位於 **Image** 分類下)形式提供。 diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py index bfb1a716..8686170d 100644 --- a/je_auto_control/__init__.py +++ b/je_auto_control/__init__.py @@ -143,6 +143,8 @@ ) # Theme-invariant matching so a light template matches dark mode from je_auto_control.utils.theme_normalize import match_theme, normalize_theme +# Attribute a screen change to the specific element boxes that changed +from je_auto_control.utils.change_localize import localize_changes, rank_changes # Rich clipboard formats — RTF + CSV/TSV codecs and Windows get / set from je_auto_control.utils.clipboard_rich_formats import ( build_rtf, csv_to_rows, get_clipboard_csv, get_clipboard_rtf, rows_to_csv, @@ -1771,6 +1773,7 @@ def start_autocontrol_gui(*args, **kwargs): "place_labels", "label_color", "grade_contrast", "dominant_pair", "region_contrast", "normalize_theme", "match_theme", + "localize_changes", "rank_changes", "build_rtf", "rtf_to_text", "rows_to_csv", "csv_to_rows", "set_clipboard_rtf", "get_clipboard_rtf", "set_clipboard_csv", "get_clipboard_csv", diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py index 1d1f6f0f..ac0cf971 100644 --- a/je_auto_control/gui/script_builder/command_schema.py +++ 
b/je_auto_control/gui/script_builder/command_schema.py @@ -4606,6 +4606,32 @@ def _add_work_queue_specs(specs: List[CommandSpec]) -> None: ), description="Locate a template across a light/dark theme flip.", )) + specs.append(CommandSpec( + "AC_rank_changes", "Image", "Rank Changed Boxes", + fields=( + FieldSpec("scored_boxes", FieldType.STRING, + placeholder="JSON list of {box, score}"), + FieldSpec("threshold", FieldType.FLOAT, optional=True, + default=0.1), + ), + description="Rank scored element boxes by how much they changed.", + )) + specs.append(CommandSpec( + "AC_localize_changes", "Image", "Localize Changed Elements", + fields=( + FieldSpec("reference", FieldType.STRING, + placeholder="reference image path"), + FieldSpec("boxes", FieldType.STRING, + placeholder="JSON list of [x, y, w, h]"), + FieldSpec("current", FieldType.STRING, optional=True, + placeholder="current image path (else screen)"), + FieldSpec("threshold", FieldType.FLOAT, optional=True, + default=0.1), + FieldSpec("region", FieldType.STRING, optional=True, + placeholder="[x, y, w, h]"), + ), + description="Rank which element boxes changed between two frames.", + )) specs.append(CommandSpec( "AC_normalize_ext", "Shell", "Normalize Extension", fields=( diff --git a/je_auto_control/utils/change_localize/__init__.py b/je_auto_control/utils/change_localize/__init__.py new file mode 100644 index 00000000..c11b398b --- /dev/null +++ b/je_auto_control/utils/change_localize/__init__.py @@ -0,0 +1,6 @@ +"""Attribute a screen change to the specific element boxes that changed.""" +from je_auto_control.utils.change_localize.change_localize import ( + localize_changes, rank_changes, +) + +__all__ = ["localize_changes", "rank_changes"] diff --git a/je_auto_control/utils/change_localize/change_localize.py b/je_auto_control/utils/change_localize/change_localize.py new file mode 100644 index 00000000..efb38cc6 --- /dev/null +++ b/je_auto_control/utils/change_localize/change_localize.py @@ -0,0 +1,74 @@ 
+"""Attribute a screen change to the specific elements that changed. + +The existing diffs answer "*where* did pixels change" (``motion_regions``, +``perceptual_diff``, ``ssim_changed_regions`` return raw pixel regions) or "which +*accessibility* elements differ" (``element_diff``, needs a11y metadata). The +missing middle is: given a frame diff **and a list of element boxes**, which of +*those* elements changed? ``change_localize`` scores each supplied box by how +much it changed and ranks them. + +* :func:`rank_changes` — pure: take ``[{box, score}]`` and mark each box + ``changed`` (score at or above ``threshold``), sorted most-changed first. +* :func:`localize_changes` — diff a reference against the current screen, score + each element box by its mean pixel change, and rank them. + +cv2 / numpy are imported lazily (the module stays importable without them) and +the loaders reuse :mod:`visual_match`. The ranking is pure and fully testable. +Imports no ``PySide6``. +""" +from typing import Any, Dict, List, Optional, Sequence + + +def _unpack(item: Any) -> tuple: + """Return ``(box, score)`` from a ``{box, score}`` dict or a ``(box, score)``.""" + if isinstance(item, dict): + return item["box"], item["score"] + return item[0], item[1] + + +def rank_changes(scored_boxes: Sequence[Any], *, + threshold: float = 0.1) -> List[Dict[str, Any]]: + """Mark and rank scored element boxes by how much they changed (pure). + + ``scored_boxes`` is a sequence of ``{box, score}`` (or ``(box, score)``). + Returns ``[{box, score, changed}]`` sorted by descending score; ``changed`` + is ``True`` when the score is at or above ``threshold``. 
+ """ + limit = float(threshold) + result = [ + {"box": [int(value) for value in box], + "score": round(float(score), 4), + "changed": float(score) >= limit} + for box, score in (_unpack(item) for item in scored_boxes) + ] + result.sort(key=lambda entry: entry["score"], reverse=True) + return result + + +def _box_mean(diff: Any, box: Sequence[int]) -> float: + """Mean change (0..1) of the diff map inside ``box`` (numpy).""" + x, y, w, h = (int(box[0]), int(box[1]), int(box[2]), int(box[3])) + patch = diff[max(0, y):y + h, max(0, x):x + w] + return float(patch.mean()) if patch.size else 0.0 + + +def localize_changes(reference: Any, boxes: Sequence[Sequence[int]], *, + current: Optional[Any] = None, threshold: float = 0.1, + region: Optional[Sequence[int]] = None + ) -> List[Dict[str, Any]]: + """Score and rank which of ``boxes`` changed between two frames. + + Diffs ``reference`` against ``current`` (a fresh screen grab of ``region`` + by default), takes each box's mean per-pixel change (0..1), and ranks them + via :func:`rank_changes`. Returns ``[{box, score, changed}]``. 
+ """ + import numpy as np + from je_auto_control.utils.visual_match.visual_match import ( + _grab_gray, _to_gray) + ref = _to_gray(reference).astype("float64") + other = current if current is not None else _grab_gray(region) + cur = _to_gray(other).astype("float64") + diff = np.abs(ref - cur) / 255.0 + scored = [{"box": list(box), "score": _box_mean(diff, box)} + for box in boxes] + return rank_changes(scored, threshold=threshold) diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py index 6ba236f4..fc271047 100644 --- a/je_auto_control/utils/executor/action_executor.py +++ b/je_auto_control/utils/executor/action_executor.py @@ -2912,6 +2912,26 @@ def _match_theme(template: Any, region: Any = None, method: Any = "sobel", return {"found": True, **match} +def _rank_changes(scored_boxes: Any, threshold: Any = 0.1) -> Dict[str, Any]: + """Adapter: rank scored element boxes by how much they changed (pure).""" + from je_auto_control.utils.change_localize import rank_changes + items = _coerce_list(scored_boxes) if scored_boxes else [] + return {"changes": rank_changes(items, threshold=float(threshold))} + + +def _localize_changes(reference: Any, boxes: Any, current: Any = None, + threshold: Any = 0.1, region: Any = None + ) -> Dict[str, Any]: + """Adapter: rank which element boxes changed between two frames (device).""" + from je_auto_control.utils.change_localize import localize_changes + box_list = _coerce_list(boxes) if boxes else [] + changes = localize_changes(str(reference), box_list, + current=str(current) if current else None, + threshold=float(threshold), + region=_coerce_region(region)) + return {"changes": changes} + + def _normalize_ext(target: str) -> Dict[str, Any]: """Adapter: the lowercased extension of a path / bare ext (pure).""" from je_auto_control.utils.file_assoc import normalize_ext @@ -6951,6 +6971,8 @@ def __init__(self): "AC_dominant_pair": _dominant_pair, "AC_region_contrast": 
_region_contrast, "AC_match_theme": _match_theme, + "AC_rank_changes": _rank_changes, + "AC_localize_changes": _localize_changes, "AC_normalize_ext": _normalize_ext, "AC_file_association": _file_association, "AC_get_control_text": _get_control_text, diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py index 4369522b..41e8400d 100644 --- a/je_auto_control/utils/mcp_server/tools/_factories.py +++ b/je_auto_control/utils/mcp_server/tools/_factories.py @@ -4107,6 +4107,36 @@ def img_histogram_tools() -> List[MCPTool]: handler=h.match_theme, annotations=READ_ONLY, ), + MCPTool( + name="ac_rank_changes", + description=("Rank scored element boxes by how much they changed. " + "'scored_boxes' is a list of {box:[x,y,w,h], score}. " + "Pure. Returns {changes:[{box, score, changed}]} " + "sorted most-changed first."), + input_schema=schema({"scored_boxes": {"type": "array", + "items": {"type": "object"}}, + "threshold": {"type": "number"}}, + required=["scored_boxes"]), + handler=h.rank_changes, + annotations=READ_ONLY, + ), + MCPTool( + name="ac_localize_changes", + description=("Which of the supplied element 'boxes' changed between " + "a 'reference' image and the current screen (or " + "'current' image). 
Returns {changes:[{box, score, " + "changed}]}."), + input_schema=schema({"reference": {"type": "string"}, + "boxes": {"type": "array", + "items": {"type": "array"}}, + "current": {"type": "string"}, + "threshold": {"type": "number"}, + "region": {"type": "array", + "items": {"type": "integer"}}}, + required=["reference", "boxes"]), + handler=h.localize_changes, + annotations=READ_ONLY, + ), ] diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py index 5006b182..a91d0644 100644 --- a/je_auto_control/utils/mcp_server/tools/_handlers.py +++ b/je_auto_control/utils/mcp_server/tools/_handlers.py @@ -774,6 +774,19 @@ def match_theme(template, region=None, method="sobel", min_score=0.5): return _match_theme(template, region, method, min_score) +def rank_changes(scored_boxes, threshold=0.1): + from je_auto_control.utils.executor.action_executor import _rank_changes + return _rank_changes(scored_boxes, threshold) + + +def localize_changes(reference, boxes, current=None, threshold=0.1, + region=None): + from je_auto_control.utils.executor.action_executor import ( + _localize_changes, + ) + return _localize_changes(reference, boxes, current, threshold, region) + + def normalize_ext(target): from je_auto_control.utils.executor.action_executor import _normalize_ext return _normalize_ext(target) diff --git a/test/unit_test/headless/test_change_localize_batch.py b/test/unit_test/headless/test_change_localize_batch.py new file mode 100644 index 00000000..84110e53 --- /dev/null +++ b/test/unit_test/headless/test_change_localize_batch.py @@ -0,0 +1,79 @@ +"""Headless tests for change_localize (pure ranking + cv2 localization).""" +import pytest + +import je_auto_control as ac +from je_auto_control.utils.change_localize import localize_changes, rank_changes + + +# --- pure rank_changes ---------------------------------------------------- + +def test_rank_changes_marks_and_sorts(): + scored = [{"box": [0, 0, 10, 10], 
"score": 0.02}, + {"box": [20, 20, 10, 10], "score": 0.5}, + {"box": [40, 40, 10, 10], "score": 0.2}] + ranked = rank_changes(scored, threshold=0.1) + # sorted most-changed first + assert [entry["score"] for entry in ranked] == pytest.approx([0.5, 0.2, + 0.02]) + assert [entry["changed"] for entry in ranked] == [True, True, False] + + +def test_rank_changes_accepts_tuples(): + ranked = rank_changes([([0, 0, 5, 5], 0.3), ([1, 1, 5, 5], 0.05)], + threshold=0.1) + assert ranked[0]["changed"] is True + assert ranked[1]["changed"] is False + + +def test_rank_changes_empty(): + assert rank_changes([]) == [] + + +def test_rank_changes_threshold_boundary(): + # a score exactly at the threshold counts as changed (>=) + ranked = rank_changes([{"box": [0, 0, 1, 1], "score": 0.1}], threshold=0.1) + assert ranked[0]["changed"] is True + + +# --- cv2 localize_changes (per-function importorskip) --------------------- + +def test_localize_changes_attributes_to_the_right_box(): + np = pytest.importorskip("numpy") + pytest.importorskip("cv2") + reference = np.zeros((100, 100), dtype="uint8") + current = reference.copy() + current[40:60, 40:60] = 255 # change inside this box only + boxes = [[40, 40, 20, 20], [0, 0, 20, 20]] + ranked = localize_changes(reference, boxes, current=current, + threshold=0.05) + # the changed box ranks first and is flagged; the untouched one is not + assert ranked[0]["box"] == [40, 40, 20, 20] + assert ranked[0]["changed"] is True + untouched = [r for r in ranked if r["box"] == [0, 0, 20, 20]][0] + assert untouched["changed"] is False + + +# --- wiring (cv2-free) ---------------------------------------------------- + +def test_executor_pure_rank_path(): + from je_auto_control.utils.executor.action_executor import _rank_changes + out = _rank_changes('[{"box": [0,0,4,4], "score": 0.4}]', 0.1) + assert out["changes"][0]["changed"] is True + + +def test_wiring(): + known = set(ac.executor.known_commands()) + assert {"AC_rank_changes", "AC_localize_changes"} <= 
known + from je_auto_control.utils.mcp_server.tools import ( + build_default_tool_registry, + ) + names = {t.name for t in build_default_tool_registry()} + assert {"ac_rank_changes", "ac_localize_changes"} <= names + from je_auto_control.gui.script_builder.command_schema import _build_specs + specs = {s.command for s in _build_specs()} + assert {"AC_rank_changes", "AC_localize_changes"} <= specs + + +def test_facade_exports(): + for name in ("localize_changes", "rank_changes"): + assert hasattr(ac, name) and name in ac.__all__ From a802ab6f80de7cad71b35c02d9911366b5f38c11 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Fri, 26 Jun 2026 10:06:43 +0800 Subject: [PATCH 2/3] Add icon_classify: classify a widget from its pixel shape Set-of-Marks/element proposers return boxes but not what each box is; form_fields.checkbox_state reads a box already known to be a checkbox. box_features extracts {aspect, fill, edge_density, circularity}; classify_widget is the pure heuristic classifier (round->radio, wide-rounded->toggle, square-sparse->checkbox, wide-hollow->text_field, wide-filled->button, else icon); classify_icon composes them. cv2 lazy. 
--- WHATS_NEW.md | 6 + .../doc/new_features/v219_features_doc.rst | 46 ++++++++ .../Zh/doc/new_features/v219_features_doc.rst | 38 +++++++ je_auto_control/__init__.py | 5 + .../gui/script_builder/command_schema.py | 16 +++ .../utils/executor/action_executor.py | 16 +++ .../utils/icon_classify/__init__.py | 6 + .../utils/icon_classify/icon_classify.py | 107 ++++++++++++++++++ .../utils/mcp_server/tools/_factories.py | 22 ++++ .../utils/mcp_server/tools/_handlers.py | 10 ++ .../headless/test_icon_classify_batch.py | 107 ++++++++++++++++++ 11 files changed, 379 insertions(+) create mode 100644 docs/source/Eng/doc/new_features/v219_features_doc.rst create mode 100644 docs/source/Zh/doc/new_features/v219_features_doc.rst create mode 100644 je_auto_control/utils/icon_classify/__init__.py create mode 100644 je_auto_control/utils/icon_classify/icon_classify.py create mode 100644 test/unit_test/headless/test_icon_classify_batch.py diff --git a/WHATS_NEW.md b/WHATS_NEW.md index 3ebd77c4..83c0dbe3 100644 --- a/WHATS_NEW.md +++ b/WHATS_NEW.md @@ -2,6 +2,12 @@ ## What's new (2026-06-26) +### Classify a Widget from Its Pixel Shape + +Tell a checkbox from a radio button from a text field — from pixels, no model. Full reference: [`docs/source/Eng/doc/new_features/v219_features_doc.rst`](docs/source/Eng/doc/new_features/v219_features_doc.rst). + +- **`classify_widget` / `box_features` / `classify_icon`** (`AC_classify_widget`, `AC_classify_icon`): Set-of-Marks and element proposers return *boxes* but not *what each box is*; `form_fields.checkbox_state` reads a box already known to be a checkbox — the gap is the typing step before it. `box_features` extracts `{aspect, fill, edge_density, circularity}` for a box; `classify_widget` is the pure heuristic classifier (round→radio, wide-rounded→toggle, square-sparse→checkbox, wide-hollow→text_field, wide-filled→button, else icon); `classify_icon` composes them. 
The classifier is pure and fully testable; cv2/numpy imported lazily so the module stays importable. Sixth feature of the ROUND-15 perception lane. No `PySide6`. + ### Localize a Change to the Elements That Changed Turn a raw screen diff into "element 3 changed" by scoring a list of element boxes. Full reference: [`docs/source/Eng/doc/new_features/v218_features_doc.rst`](docs/source/Eng/doc/new_features/v218_features_doc.rst). diff --git a/docs/source/Eng/doc/new_features/v219_features_doc.rst b/docs/source/Eng/doc/new_features/v219_features_doc.rst new file mode 100644 index 00000000..8e595dde --- /dev/null +++ b/docs/source/Eng/doc/new_features/v219_features_doc.rst @@ -0,0 +1,46 @@ +Classify a Widget from Its Pixel Shape +====================================== + +Set-of-Marks and element proposers hand back *boxes*, but not *what each box is*. +``form_fields.checkbox_state`` already reads a box known to be a checkbox; the +gap is the typing step before it — is this box a checkbox, a radio button, a push +button, a text field or a toggle? ``icon_classify`` answers that from cheap +geometric features (no model). + +* :func:`box_features` — extract ``{aspect, fill, edge_density, circularity}`` + for a box region (the objective measurements). +* :func:`classify_widget` — pure: map a feature dict to a widget type by + documented heuristics. +* :func:`classify_icon` — compose the two: a box to ``{type, features}``. + +``classify_widget`` is pure and fully testable; ``box_features`` imports cv2 / +numpy lazily (the module stays importable without them) and reuses +:func:`visual_match._to_gray`. Imports no ``PySide6``. + +Headless API +------------ + +.. 
code-block:: python + + from je_auto_control import classify_icon, classify_widget + + # From a screenshot + a box: + classify_icon("dialog.png", [120, 80, 16, 16]) + # {'type': 'checkbox', 'features': {'aspect': 1.0, 'fill': 0.12, ...}} + + # From features you already have: + classify_widget({"aspect": 1.0, "circularity": 0.9, "fill": 0.4}) # 'radio' + +The heuristics: a round box (aspect ≈ 1, high circularity) is a ``radio``; a wide +rounded box is a ``toggle``; a near-square sparse box is a ``checkbox``; a wide +hollow box is a ``text_field``; a wide filled box is a ``button``; anything else +is an ``icon``. Tune by reading ``features`` and applying your own rules where +the defaults misfire — the measurements are the durable part. + +Executor commands +----------------- + +``AC_classify_widget`` (``features`` JSON object → ``{type}``, pure) and +``AC_classify_icon`` (``source`` image + ``box`` ``[x, y, w, h]`` → +``{type, features}``). Both are also available as matching read-only ``ac_*`` +MCP tools and as Script Builder commands under **Image**. diff --git a/docs/source/Zh/doc/new_features/v219_features_doc.rst b/docs/source/Zh/doc/new_features/v219_features_doc.rst new file mode 100644 index 00000000..00cfaf05 --- /dev/null +++ b/docs/source/Zh/doc/new_features/v219_features_doc.rst @@ -0,0 +1,38 @@ +從像素形狀分類控制項 +==================== + +Set-of-Marks 與元素提案器回傳*方框*,卻不告訴你*每個方框是什麼*。``form_fields.checkbox_state`` +已能讀取一個已知是核取方塊的方框;缺少的是它之前的分類步驟——這個方框是核取方塊、單選鈕、按鈕、 +文字欄位還是切換開關?``icon_classify`` 從低成本的幾何特徵(無需模型)回答此問題。 + +* :func:`box_features` ——擷取方框區域的 ``{aspect, fill, edge_density, circularity}``(客觀量測)。 +* :func:`classify_widget` ——純函式:以記載的啟發式規則把特徵字典映射為控制項型別。 +* :func:`classify_icon` ——組合兩者:把一個方框轉為 ``{type, features}``。 + +``classify_widget`` 為純函式且可完整測試;``box_features`` 延遲匯入 cv2 / numpy(模組無需它們即可匯入), +並重用 :func:`visual_match._to_gray`。不匯入 ``PySide6``。 + +無頭 API +-------- + +.. 
code-block:: python + + from je_auto_control import classify_icon, classify_widget + + # 從截圖 + 方框: + classify_icon("dialog.png", [120, 80, 16, 16]) + # {'type': 'checkbox', 'features': {'aspect': 1.0, 'fill': 0.12, ...}} + + # 從你已有的特徵: + classify_widget({"aspect": 1.0, "circularity": 0.9, "fill": 0.4}) # 'radio' + +啟發式規則:圓形方框(aspect ≈ 1、高 circularity)為 ``radio``;寬且圓潤為 ``toggle``; +近正方且稀疏為 ``checkbox``;寬且空心為 ``text_field``;寬且填滿為 ``button``;其餘為 ``icon``。 +在預設誤判處,可讀取 ``features`` 套用你自己的規則微調——量測值才是耐用的部分。 + +執行器指令 +---------- + +``AC_classify_widget``(``features`` JSON 物件 → ``{type}``,純函式)與 +``AC_classify_icon``(``source`` 影像 + ``box`` ``[x, y, w, h]`` → ``{type, features}``)。 +皆以對應的唯讀 ``ac_*`` MCP 工具及 Script Builder 指令(位於 **Image** 分類下)形式提供。 diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py index 8686170d..54b3f178 100644 --- a/je_auto_control/__init__.py +++ b/je_auto_control/__init__.py @@ -145,6 +145,10 @@ from je_auto_control.utils.theme_normalize import match_theme, normalize_theme # Attribute a screen change to the specific element boxes that changed from je_auto_control.utils.change_localize import localize_changes, rank_changes +# Classify what kind of widget a box is from its pixel shape +from je_auto_control.utils.icon_classify import ( + box_features, classify_icon, classify_widget, +) # Rich clipboard formats — RTF + CSV/TSV codecs and Windows get / set from je_auto_control.utils.clipboard_rich_formats import ( build_rtf, csv_to_rows, get_clipboard_csv, get_clipboard_rtf, rows_to_csv, @@ -1774,6 +1778,7 @@ def start_autocontrol_gui(*args, **kwargs): "grade_contrast", "dominant_pair", "region_contrast", "normalize_theme", "match_theme", "localize_changes", "rank_changes", + "classify_widget", "box_features", "classify_icon", "build_rtf", "rtf_to_text", "rows_to_csv", "csv_to_rows", "set_clipboard_rtf", "get_clipboard_rtf", "set_clipboard_csv", "get_clipboard_csv", diff --git a/je_auto_control/gui/script_builder/command_schema.py 
b/je_auto_control/gui/script_builder/command_schema.py index ac0cf971..f642c163 100644 --- a/je_auto_control/gui/script_builder/command_schema.py +++ b/je_auto_control/gui/script_builder/command_schema.py @@ -4632,6 +4632,22 @@ def _add_work_queue_specs(specs: List[CommandSpec]) -> None: ), description="Rank which element boxes changed between two frames.", )) + specs.append(CommandSpec( + "AC_classify_widget", "Image", "Classify Widget (features)", + fields=( + FieldSpec("features", FieldType.STRING, + placeholder="JSON {aspect, circularity, fill}"), + ), + description="Map geometric features to a widget type.", + )) + specs.append(CommandSpec( + "AC_classify_icon", "Image", "Classify Icon (box)", + fields=( + FieldSpec("source", FieldType.STRING, placeholder="image path"), + FieldSpec("box", FieldType.STRING, placeholder="[x, y, w, h]"), + ), + description="Classify the widget in an image box from its pixels.", + )) specs.append(CommandSpec( "AC_normalize_ext", "Shell", "Normalize Extension", fields=( diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py index fc271047..62168052 100644 --- a/je_auto_control/utils/executor/action_executor.py +++ b/je_auto_control/utils/executor/action_executor.py @@ -2932,6 +2932,20 @@ def _localize_changes(reference: Any, boxes: Any, current: Any = None, return {"changes": changes} +def _classify_widget(features: Any) -> Dict[str, Any]: + """Adapter: map geometric features to a widget type (pure).""" + from je_auto_control.utils.icon_classify import classify_widget + import json + data = json.loads(features) if isinstance(features, str) else dict(features) + return {"type": classify_widget(data)} + + +def _classify_icon(source: Any, box: Any) -> Dict[str, Any]: + """Adapter: classify the widget in a box from its pixels (device).""" + from je_auto_control.utils.icon_classify import classify_icon + return classify_icon(str(source), _coerce_list(box)) + + def 
_normalize_ext(target: str) -> Dict[str, Any]: """Adapter: the lowercased extension of a path / bare ext (pure).""" from je_auto_control.utils.file_assoc import normalize_ext @@ -6973,6 +6987,8 @@ def __init__(self): "AC_match_theme": _match_theme, "AC_rank_changes": _rank_changes, "AC_localize_changes": _localize_changes, + "AC_classify_widget": _classify_widget, + "AC_classify_icon": _classify_icon, "AC_normalize_ext": _normalize_ext, "AC_file_association": _file_association, "AC_get_control_text": _get_control_text, diff --git a/je_auto_control/utils/icon_classify/__init__.py b/je_auto_control/utils/icon_classify/__init__.py new file mode 100644 index 00000000..f7c35564 --- /dev/null +++ b/je_auto_control/utils/icon_classify/__init__.py @@ -0,0 +1,6 @@ +"""Classify what kind of widget a box is from its pixel shape.""" +from je_auto_control.utils.icon_classify.icon_classify import ( + WIDGET_TYPES, box_features, classify_icon, classify_widget, +) + +__all__ = ["classify_widget", "box_features", "classify_icon", "WIDGET_TYPES"] diff --git a/je_auto_control/utils/icon_classify/icon_classify.py b/je_auto_control/utils/icon_classify/icon_classify.py new file mode 100644 index 00000000..edd11c23 --- /dev/null +++ b/je_auto_control/utils/icon_classify/icon_classify.py @@ -0,0 +1,107 @@ +"""Classify what kind of widget a box is from its pixel shape. + +Set-of-Marks and element proposers hand back *boxes*, but not *what each box is*. +``form_fields.checkbox_state`` already reads a box known to be a checkbox; the +gap is the typing step before it — is this box a checkbox, a radio button, a +push button, a text field or a toggle? ``icon_classify`` answers that from cheap +geometric features (no model): + +* :func:`box_features` — extract ``{aspect, fill, edge_density, circularity}`` + for a box region (the objective measurements). +* :func:`classify_widget` — pure: map a feature dict to a widget type by + documented heuristics. 
+* :func:`classify_icon` — compose the two: a box to ``{type, features}``. + +``classify_widget`` is pure and fully testable; ``box_features`` imports cv2 / +numpy lazily (the module stays importable without them) and reuses +:func:`visual_match._to_gray`. Imports no ``PySide6``. +""" +from typing import Any, Dict, Sequence + +# The widget types this classifier can return. +WIDGET_TYPES = ("radio", "toggle", "checkbox", "text_field", "button", "icon") + + +def _is_round(aspect: float, circ: float) -> bool: + """Near-square and circular (a radio button / round dot).""" + return 0.7 <= aspect <= 1.4 and circ >= 0.7 + + +def _is_pill(aspect: float, circ: float) -> bool: + """Wide and rounded (a toggle switch).""" + return 1.8 <= aspect <= 3.5 and circ >= 0.55 + + +def classify_widget(features: Dict[str, float]) -> str: + """Map geometric ``features`` to a widget type by heuristics (pure). + + Uses ``aspect`` (w/h), ``circularity`` (1 = circle), and ``fill`` (ink + fraction). Round → ``radio``; wide & rounded → ``toggle``; near-square & + sparse → ``checkbox``; wide & hollow → ``text_field``; wide & filled → + ``button``; otherwise ``icon``. 
+ """ + aspect = float(features.get("aspect", 1.0)) + circ = float(features.get("circularity", 0.0)) + fill = float(features.get("fill", 0.0)) + if _is_round(aspect, circ): + return "radio" + if _is_pill(aspect, circ): + return "toggle" + if 0.7 <= aspect <= 1.4 and fill <= 0.6: + return "checkbox" + if aspect >= 2.5 and fill <= 0.2: + return "text_field" + if aspect >= 1.5 and fill >= 0.2: + return "button" + return "icon" + + +def _circularity(binary: Any) -> float: + """Circularity (``4*pi*A / P^2``, 1 = circle) of the largest blob.""" + import math + + import cv2 + contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, + cv2.CHAIN_APPROX_SIMPLE) + if not contours: + return 0.0 + largest = max(contours, key=cv2.contourArea) + area = float(cv2.contourArea(largest)) + perimeter = float(cv2.arcLength(largest, True)) + if perimeter <= 0.0: + return 0.0 + return min(1.0, 4.0 * math.pi * area / (perimeter * perimeter)) + + +def box_features(source: Any, box: Sequence[int]) -> Dict[str, float]: + """Extract ``{aspect, fill, edge_density, circularity}`` for a box (cv2). + + ``aspect`` is width/height, ``fill`` the ink fraction (Otsu foreground), + ``edge_density`` the Canny-edge fraction, ``circularity`` the largest blob's + roundness. An empty box yields all zeros. 
+ """ + import cv2 + from je_auto_control.utils.visual_match.visual_match import _to_gray + gray = _to_gray(source) + x, y, w, h = (int(box[0]), int(box[1]), int(box[2]), int(box[3])) + patch = gray[max(0, y):y + h, max(0, x):x + w] + if patch.size == 0: + return {"aspect": 0.0, "fill": 0.0, "edge_density": 0.0, + "circularity": 0.0} + _, binary = cv2.threshold(patch, 0, 255, + cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) + fill = float((binary > 0).sum()) / patch.size + edges = cv2.Canny(patch, 50, 150) + edge_density = float((edges > 0).sum()) / patch.size + return { + "aspect": round(w / h, 3) if h else 0.0, + "fill": round(fill, 3), + "edge_density": round(edge_density, 3), + "circularity": round(_circularity(binary), 3), + } + + +def classify_icon(source: Any, box: Sequence[int]) -> Dict[str, Any]: + """Classify the widget in a box from its pixels: ``{type, features}``.""" + features = box_features(source, box) + return {"type": classify_widget(features), "features": features} diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py index 41e8400d..3b11e843 100644 --- a/je_auto_control/utils/mcp_server/tools/_factories.py +++ b/je_auto_control/utils/mcp_server/tools/_factories.py @@ -4137,6 +4137,28 @@ def img_histogram_tools() -> List[MCPTool]: handler=h.localize_changes, annotations=READ_ONLY, ), + MCPTool( + name="ac_classify_widget", + description=("Map geometric 'features' {aspect, circularity, fill} " + "to a widget type (radio/toggle/checkbox/text_field/" + "button/icon). Pure. Returns {type}."), + input_schema=schema({"features": {"type": "object"}}, + required=["features"]), + handler=h.classify_widget, + annotations=READ_ONLY, + ), + MCPTool( + name="ac_classify_icon", + description=("Classify the widget in a 'box' [x,y,w,h] of a " + "'source' image from its pixel shape. 
Returns {type, " + "features}."), + input_schema=schema({"source": {"type": "string"}, + "box": {"type": "array", + "items": {"type": "integer"}}}, + required=["source", "box"]), + handler=h.classify_icon, + annotations=READ_ONLY, + ), ] diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py index a91d0644..66875681 100644 --- a/je_auto_control/utils/mcp_server/tools/_handlers.py +++ b/je_auto_control/utils/mcp_server/tools/_handlers.py @@ -787,6 +787,16 @@ def localize_changes(reference, boxes, current=None, threshold=0.1, return _localize_changes(reference, boxes, current, threshold, region) +def classify_widget(features): + from je_auto_control.utils.executor.action_executor import _classify_widget + return _classify_widget(features) + + +def classify_icon(source, box): + from je_auto_control.utils.executor.action_executor import _classify_icon + return _classify_icon(source, box) + + def normalize_ext(target): from je_auto_control.utils.executor.action_executor import _normalize_ext return _normalize_ext(target) diff --git a/test/unit_test/headless/test_icon_classify_batch.py b/test/unit_test/headless/test_icon_classify_batch.py new file mode 100644 index 00000000..63f65444 --- /dev/null +++ b/test/unit_test/headless/test_icon_classify_batch.py @@ -0,0 +1,107 @@ +"""Headless tests for icon_classify (pure classifier + cv2 features).""" +import pytest + +import je_auto_control as ac +from je_auto_control.utils.icon_classify import ( + box_features, classify_icon, classify_widget, +) + + +# --- pure classify_widget ------------------------------------------------- + +def test_classify_radio_round(): + assert classify_widget( + {"aspect": 1.0, "circularity": 0.92, "fill": 0.4}) == "radio" + + +def test_classify_toggle_wide_rounded(): + assert classify_widget( + {"aspect": 2.4, "circularity": 0.6, "fill": 0.5}) == "toggle" + + +def test_classify_checkbox_square_sparse(): + assert classify_widget( + 
{"aspect": 1.0, "circularity": 0.2, "fill": 0.1}) == "checkbox" + + +def test_classify_text_field_wide_hollow(): + assert classify_widget( + {"aspect": 4.0, "circularity": 0.1, "fill": 0.05}) == "text_field" + + +def test_classify_button_wide_filled(): + assert classify_widget( + {"aspect": 2.0, "circularity": 0.2, "fill": 0.5}) == "button" + + +def test_classify_icon_fallback(): + assert classify_widget( + {"aspect": 1.1, "circularity": 0.3, "fill": 0.9}) == "icon" + + +def test_classify_widget_defaults_dont_crash(): + assert classify_widget({}) in ("checkbox", "icon") + + +# --- cv2 box_features / classify_icon (per-function importorskip) ---------- + +def test_box_features_circle_rounder_than_square(): + np = pytest.importorskip("numpy") + cv2 = pytest.importorskip("cv2") + canvas = np.full((40, 40), 255, dtype="uint8") + cv2.circle(canvas, (20, 20), 14, 0, -1) + circle = box_features(canvas, [3, 3, 34, 34]) + square_canvas = np.full((40, 40), 255, dtype="uint8") + cv2.rectangle(square_canvas, (6, 6), (34, 34), 0, -1) + square = box_features(square_canvas, [3, 3, 34, 34]) + assert circle["circularity"] > square["circularity"] + assert circle["circularity"] > 0.8 + + +def test_classify_icon_detects_radio_from_pixels(): + np = pytest.importorskip("numpy") + cv2 = pytest.importorskip("cv2") + canvas = np.full((40, 40), 255, dtype="uint8") + cv2.circle(canvas, (20, 20), 13, 0, -1) # filled round dot + result = classify_icon(canvas, [4, 4, 32, 32]) + assert result["type"] == "radio" + assert set(result["features"]) == {"aspect", "fill", "edge_density", + "circularity"} + + +def test_box_features_empty_box(): + pytest.importorskip("numpy") + pytest.importorskip("cv2") + import numpy as np + canvas = np.zeros((10, 10), dtype="uint8") + feats = box_features(canvas, [0, 0, 0, 0]) + assert feats == {"aspect": 0.0, "fill": 0.0, "edge_density": 0.0, + "circularity": 0.0} + + +# --- wiring (cv2-free) ---------------------------------------------------- + +def 
test_executor_pure_classify_path(): + from je_auto_control.utils.executor.action_executor import ( + _classify_widget, + ) + out = _classify_widget('{"aspect": 1.0, "circularity": 0.9, "fill": 0.3}') + assert out["type"] == "radio" + + +def test_wiring(): + known = set(ac.executor.known_commands()) + assert {"AC_classify_widget", "AC_classify_icon"} <= known + from je_auto_control.utils.mcp_server.tools import ( + build_default_tool_registry, + ) + names = {t.name for t in build_default_tool_registry()} + assert {"ac_classify_widget", "ac_classify_icon"} <= names + from je_auto_control.gui.script_builder.command_schema import _build_specs + specs = {s.command for s in _build_specs()} + assert {"AC_classify_widget", "AC_classify_icon"} <= specs + + +def test_facade_exports(): + for name in ("classify_widget", "box_features", "classify_icon"): + assert hasattr(ac, name) and name in ac.__all__ From 01ce41eee5ca0f294c9954e2a054471087b06286 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Fri, 26 Jun 2026 10:23:11 +0800 Subject: [PATCH 3/3] Add element_proposal: propose a clean element list from raw pixels Set-of-Marks/observation/grounding assume you already have element boxes, but a game/custom-drawn app/remote desktop has no accessibility tree. propose_elements builds the top-of-funnel list from pixels: widget blobs (Canny+morphology+connected_boxes) + text regions, fused via element_parse (ocr>icon priority is the drop-widget-that-is-really-text cross-check), reading-ordered and tagged text/widget. tag_kinds is the pure labeller. 
--- WHATS_NEW.md | 6 ++ .../doc/new_features/v220_features_doc.rst | 51 +++++++++++ .../Zh/doc/new_features/v220_features_doc.rst | 42 +++++++++ je_auto_control/__init__.py | 3 + .../gui/script_builder/command_schema.py | 19 +++++ .../utils/element_proposal/__init__.py | 6 ++ .../element_proposal/element_proposal.py | 80 +++++++++++++++++ .../utils/executor/action_executor.py | 19 +++++ .../utils/mcp_server/tools/_factories.py | 25 ++++++ .../utils/mcp_server/tools/_handlers.py | 12 +++ .../headless/test_element_proposal_batch.py | 85 +++++++++++++++++++ 11 files changed, 348 insertions(+) create mode 100644 docs/source/Eng/doc/new_features/v220_features_doc.rst create mode 100644 docs/source/Zh/doc/new_features/v220_features_doc.rst create mode 100644 je_auto_control/utils/element_proposal/__init__.py create mode 100644 je_auto_control/utils/element_proposal/element_proposal.py create mode 100644 test/unit_test/headless/test_element_proposal_batch.py diff --git a/WHATS_NEW.md b/WHATS_NEW.md index 83c0dbe3..18ba3291 100644 --- a/WHATS_NEW.md +++ b/WHATS_NEW.md @@ -2,6 +2,12 @@ ## What's new (2026-06-26) +### Template-Free Element Proposal (Pixels to Elements) + +Get a clean numbered element list straight from the screen when there's no accessibility tree. Full reference: [`docs/source/Eng/doc/new_features/v220_features_doc.rst`](docs/source/Eng/doc/new_features/v220_features_doc.rst). + +- **`propose_elements` / `tag_kinds`** (`AC_propose_elements`, `AC_tag_kinds`): Set-of-Marks, `observation` and the grounding helpers all assume you already have element boxes — but a game, a custom-drawn app or a remote desktop has no accessibility tree. 
`propose_elements` builds that top-of-funnel list from pixels: detect widget boxes (closed-edge blobs via Canny + morphology + `connected_boxes`) and text boxes (`text_regions.find_text_regions`), fuse them — the `element_parse` `ocr > icon` priority *is* the "drop widget-that-is-really-text" cross-check — and return them in reading order, each tagged `text` or `widget`. `tag_kinds` is the pure labeller. cv2 imported lazily; the labeller is fully testable. Seventh and final feature of the ROUND-15 perception lane. No `PySide6`. + ### Classify a Widget from Its Pixel Shape Tell a checkbox from a radio button from a text field — from pixels, no model. Full reference: [`docs/source/Eng/doc/new_features/v219_features_doc.rst`](docs/source/Eng/doc/new_features/v219_features_doc.rst). diff --git a/docs/source/Eng/doc/new_features/v220_features_doc.rst b/docs/source/Eng/doc/new_features/v220_features_doc.rst new file mode 100644 index 00000000..17956790 --- /dev/null +++ b/docs/source/Eng/doc/new_features/v220_features_doc.rst @@ -0,0 +1,51 @@ +Template-Free Element Proposal (Pixels to Elements) +=================================================== + +Set-of-Marks, ``observation`` and the grounding helpers all assume you already +have a list of element boxes — but on a screen the framework doesn't model +(a game, a custom-drawn app, a remote desktop) there is no accessibility tree to +provide one. ``element_proposal`` builds that top-of-funnel list from pixels: +detect candidate *widget* boxes (closed-edge blobs) and *text* boxes +(:func:`text_regions.find_text_regions`), fuse them — dropping widget boxes that +are really just text — and return them in reading order, each tagged ``text`` or +``widget``. + +* :func:`propose_elements` — the full pixel-to-elements pipeline. +* :func:`tag_kinds` — pure: label fused boxes ``text`` / ``widget`` by source and + keep their reading-order ``index``. 
+ +The fusion / cross-check / ordering reuse :mod:`element_parse` — the ``ocr`` > +``icon`` source priority *is* the "drop widget-that-is-really-text" check — and +the text detection reuses :mod:`text_regions`. ``cv2`` is imported lazily so the +module stays importable; :func:`tag_kinds` is pure and fully testable. Imports no +``PySide6``. + +Headless API +------------ + +.. code-block:: python + + from je_auto_control import propose_elements, mark_elements + + # No accessibility tree? Propose elements straight from the screen: + elements = propose_elements(min_area=120) + # [{'box': [x, y, w, h], 'kind': 'widget', 'index': 0}, ...] + + # Feed them to Set-of-Marks like any other element list: + marks = mark_elements(elements) + +``propose_elements`` returns ``[{box, kind, index}]`` in reading order, where +``kind`` is ``text`` or ``widget``. It is the missing top-of-funnel for the +agent stack on un-modelled UIs: pixels in, a clean numbered element list out, +ready for marking, observation or grounding. Tune ``min_area`` for the smallest +control you care about and ``iou_threshold`` for how aggressively overlapping +text and widget boxes are merged. + +Executor commands +----------------- + +``AC_propose_elements`` (``region`` ``[x, y, w, h]`` / ``min_area`` / +``iou_threshold`` → ``{elements}``) runs the full pipeline on the screen, and +``AC_tag_kinds`` (``elements`` JSON list → ``{elements}``, pure) labels a +pre-fused list. Both have matching read-only ``ac_*`` MCP tools and Script +Builder commands under **Image**. 
diff --git a/docs/source/Zh/doc/new_features/v220_features_doc.rst b/docs/source/Zh/doc/new_features/v220_features_doc.rst new file mode 100644 index 00000000..45554aff --- /dev/null +++ b/docs/source/Zh/doc/new_features/v220_features_doc.rst @@ -0,0 +1,42 @@ +免模板元素提案(像素到元素) +============================ + +Set-of-Marks、``observation`` 與 grounding 輔助函式都假設你已有一份元素方框清單——但在框架無法 +建模的畫面上(遊戲、自繪 app、遠端桌面),並沒有無障礙樹可提供。``element_proposal`` 從像素建立 +這份漏斗頂端清單:偵測候選*控制項*方框(封閉邊緣 blob)與*文字*方框 +(:func:`text_regions.find_text_regions`),將兩者融合——丟棄其實只是文字的控制項方框—— +並依閱讀順序回傳,每個標記為 ``text`` 或 ``widget``。 + +* :func:`propose_elements` ——完整的像素到元素管線。 +* :func:`tag_kinds` ——純函式:依來源把融合後的方框標記 ``text`` / ``widget``,並保留其閱讀順序 ``index``。 + +融合 / 交叉檢查 / 排序重用 :mod:`element_parse`——``ocr`` > ``icon`` 來源優先序*即*「丟棄其實是 +文字的控制項」檢查——文字偵測則重用 :mod:`text_regions`。``cv2`` 採延遲匯入,故模組仍可匯入; +:func:`tag_kinds` 為純函式且可完整測試。不匯入 ``PySide6``。 + +無頭 API +-------- + +.. code-block:: python + + from je_auto_control import propose_elements, mark_elements + + # 沒有無障礙樹?直接從畫面提案元素: + elements = propose_elements(min_area=120) + # [{'box': [x, y, w, h], 'kind': 'widget', 'index': 0}, ...] 
+ + # 像任何元素清單一樣餵給 Set-of-Marks: + marks = mark_elements(elements) + +``propose_elements`` 依閱讀順序回傳 ``[{box, kind, index}]``,``kind`` 為 ``text`` 或 ``widget``。 +它是 agent 堆疊在未建模 UI 上缺少的漏斗頂端:像素進、乾淨的編號元素清單出,可供標記、observation +或 grounding。以 ``min_area`` 調整你在意的最小控制項,以 ``iou_threshold`` 調整重疊文字與控制項 +方框合併的積極程度。 + +執行器指令 +---------- + +``AC_propose_elements``(``region`` ``[x, y, w, h]`` / ``min_area`` / +``iou_threshold`` → ``{elements}``)在畫面上執行完整管線,``AC_tag_kinds`` +(``elements`` JSON 清單 → ``{elements}``,純函式)則標記預先融合的清單。皆以對應的唯讀 +``ac_*`` MCP 工具及 Script Builder 指令(位於 **Image** 分類下)形式提供。 diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py index 54b3f178..1ccf1d6d 100644 --- a/je_auto_control/__init__.py +++ b/je_auto_control/__init__.py @@ -149,6 +149,8 @@ from je_auto_control.utils.icon_classify import ( box_features, classify_icon, classify_widget, ) +# Propose a clean element list from raw pixels (template-free) +from je_auto_control.utils.element_proposal import propose_elements, tag_kinds # Rich clipboard formats — RTF + CSV/TSV codecs and Windows get / set from je_auto_control.utils.clipboard_rich_formats import ( build_rtf, csv_to_rows, get_clipboard_csv, get_clipboard_rtf, rows_to_csv, @@ -1779,6 +1781,7 @@ def start_autocontrol_gui(*args, **kwargs): "normalize_theme", "match_theme", "localize_changes", "rank_changes", "classify_widget", "box_features", "classify_icon", + "propose_elements", "tag_kinds", "build_rtf", "rtf_to_text", "rows_to_csv", "csv_to_rows", "set_clipboard_rtf", "get_clipboard_rtf", "set_clipboard_csv", "get_clipboard_csv", diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py index f642c163..50e2031b 100644 --- a/je_auto_control/gui/script_builder/command_schema.py +++ b/je_auto_control/gui/script_builder/command_schema.py @@ -4648,6 +4648,25 @@ def _add_work_queue_specs(specs: List[CommandSpec]) -> None: ), description="Classify the widget in an image box from its 
pixels.", )) + specs.append(CommandSpec( + "AC_propose_elements", "Image", "Propose Elements (template-free)", + fields=( + FieldSpec("region", FieldType.STRING, optional=True, + placeholder="[x, y, w, h]"), + FieldSpec("min_area", FieldType.INT, optional=True, default=80), + FieldSpec("iou_threshold", FieldType.FLOAT, optional=True, + default=0.5), + ), + description="Propose text/widget element boxes from raw screen pixels.", + )) + specs.append(CommandSpec( + "AC_tag_kinds", "Image", "Tag Element Kinds", + fields=( + FieldSpec("elements", FieldType.STRING, + placeholder="JSON list of fused boxes"), + ), + description="Label fused element boxes text/widget by source.", + )) specs.append(CommandSpec( "AC_normalize_ext", "Shell", "Normalize Extension", fields=( diff --git a/je_auto_control/utils/element_proposal/__init__.py b/je_auto_control/utils/element_proposal/__init__.py new file mode 100644 index 00000000..103cb9f0 --- /dev/null +++ b/je_auto_control/utils/element_proposal/__init__.py @@ -0,0 +1,6 @@ +"""Propose a clean element list from raw pixels, with no template or model.""" +from je_auto_control.utils.element_proposal.element_proposal import ( + propose_elements, tag_kinds, +) + +__all__ = ["propose_elements", "tag_kinds"] diff --git a/je_auto_control/utils/element_proposal/element_proposal.py b/je_auto_control/utils/element_proposal/element_proposal.py new file mode 100644 index 00000000..3b08b274 --- /dev/null +++ b/je_auto_control/utils/element_proposal/element_proposal.py @@ -0,0 +1,80 @@ +"""Propose a clean element list from raw pixels, with no template or model. + +Set-of-Marks, ``observation`` and the grounding helpers all assume you already +have a list of element boxes — but on a screen the framework doesn't model +(a game, a custom-drawn app, a remote desktop) there is no accessibility tree to +provide one. 
``element_proposal`` builds that top-of-funnel list from pixels: +detect candidate *widget* boxes (closed-edge blobs) and *text* boxes +(:func:`text_regions.find_text_regions`), fuse them — dropping widget boxes that +are really just text — and return them in reading order, each tagged ``text`` or +``widget``. + +* :func:`propose_elements` — the full pixel-to-elements pipeline. +* :func:`tag_kinds` — pure: label fused boxes ``text`` / ``widget`` by source and + keep their reading-order ``index``. + +The fusion / cross-check / ordering reuse :mod:`element_parse` (the ``ocr`` > +``icon`` priority *is* the "drop widget-that-is-really-text" check) and +:mod:`text_regions`; ``cv2`` is imported lazily so the module stays importable. +:func:`tag_kinds` is pure and fully testable. Imports no ``PySide6``. +""" +from typing import Any, Dict, List, Optional, Sequence + +# Reading-order source tag to element kind. +_KIND_BY_SOURCE = {"ocr": "text", "icon": "widget", "a11y": "element"} + + +def tag_kinds(elements: Sequence[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Label fused boxes ``text`` / ``widget`` by source (pure). + + Each input box carries a ``source`` (``ocr`` / ``icon``) and an ``index`` + from :func:`element_parse.reading_order`. Returns ``[{box, kind, index}]``. 
+ """ + result: List[Dict[str, Any]] = [] + for element in elements: + box = [int(element["x"]), int(element["y"]), + int(element["width"]), int(element["height"])] + kind = _KIND_BY_SOURCE.get(element.get("source"), "widget") + result.append({"box": box, "kind": kind, "index": element.get("index")}) + return result + + +def _reasonable(box: Dict[str, Any], frame_w: int, frame_h: int) -> bool: + """Keep plausibly-widget blobs: not the whole frame, not a thin rule.""" + width, height = int(box["width"]), int(box["height"]) + if width >= 0.95 * frame_w and height >= 0.95 * frame_h: + return False + aspect = width / height if height else 0.0 + return 0.05 <= aspect <= 15.0 + + +def _widget_boxes(gray: Any, min_area: int) -> List[Dict[str, Any]]: + """Detect candidate widget boxes as closed-edge blobs (cv2).""" + import cv2 + from je_auto_control.utils.cv2_utils.blobs import connected_boxes + edges = cv2.Canny(gray, 50, 150) + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)) + closed = cv2.morphologyEx(edges, cv2.MORPH_CLOSE, kernel) + height, width = gray.shape[:2] + return [box for box in connected_boxes(closed, min_area=int(min_area)) + if _reasonable(box, width, height)] + + +def propose_elements(source: Optional[Any] = None, *, + region: Optional[Sequence[int]] = None, min_area: int = 80, + iou_threshold: float = 0.5) -> List[Dict[str, Any]]: + """Propose ``text`` / ``widget`` element boxes from pixels, in reading order. + + Detects widget blobs and text regions on ``source`` (a fresh screen grab of + ``region`` by default), fuses them (overlapping text wins over widget), and + orders them. Returns ``[{box, kind, index}]``. 
+ """ + from je_auto_control.utils.element_parse import fuse_elements, reading_order + from je_auto_control.utils.text_regions import find_text_regions + from je_auto_control.utils.visual_match.visual_match import _haystack_gray + gray = _haystack_gray(source, region) + text = find_text_regions(gray, min_area=int(min_area)) + widgets = _widget_boxes(gray, int(min_area)) + fused = fuse_elements(ocr_boxes=text, icon_boxes=widgets, + iou_threshold=float(iou_threshold)) + return tag_kinds(reading_order(fused)) diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py index 62168052..7f5a8761 100644 --- a/je_auto_control/utils/executor/action_executor.py +++ b/je_auto_control/utils/executor/action_executor.py @@ -2946,6 +2946,23 @@ def _classify_icon(source: Any, box: Any) -> Dict[str, Any]: return classify_icon(str(source), _coerce_list(box)) +def _propose_elements(region: Any = None, min_area: Any = 80, + iou_threshold: Any = 0.5) -> Dict[str, Any]: + """Adapter: propose text/widget element boxes from pixels (device).""" + from je_auto_control.utils.element_proposal import propose_elements + elements = propose_elements(region=_coerce_region(region), + min_area=int(min_area), + iou_threshold=float(iou_threshold)) + return {"elements": elements} + + +def _tag_kinds(elements: Any) -> Dict[str, Any]: + """Adapter: label fused boxes text/widget by source (pure).""" + from je_auto_control.utils.element_proposal import tag_kinds + items = _coerce_list(elements) if elements else [] + return {"elements": tag_kinds(items)} + + def _normalize_ext(target: str) -> Dict[str, Any]: """Adapter: the lowercased extension of a path / bare ext (pure).""" from je_auto_control.utils.file_assoc import normalize_ext @@ -6989,6 +7006,8 @@ def __init__(self): "AC_localize_changes": _localize_changes, "AC_classify_widget": _classify_widget, "AC_classify_icon": _classify_icon, + "AC_propose_elements": _propose_elements, + "AC_tag_kinds": 
_tag_kinds, "AC_normalize_ext": _normalize_ext, "AC_file_association": _file_association, "AC_get_control_text": _get_control_text, diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py index 3b11e843..26c2427e 100644 --- a/je_auto_control/utils/mcp_server/tools/_factories.py +++ b/je_auto_control/utils/mcp_server/tools/_factories.py @@ -4159,6 +4159,31 @@ def img_histogram_tools() -> List[MCPTool]: handler=h.classify_icon, annotations=READ_ONLY, ), + MCPTool( + name="ac_propose_elements", + description=("Propose text/widget element boxes from raw screen " + "pixels (template-free): detect widget blobs + text " + "regions, fuse, order. 'region' [x,y,w,h] clips. " + "Returns {elements:[{box, kind, index}]}."), + input_schema=schema({"region": {"type": "array", + "items": {"type": "integer"}}, + "min_area": {"type": "integer"}, + "iou_threshold": {"type": "number"}}), + handler=h.propose_elements, + annotations=READ_ONLY, + ), + MCPTool( + name="ac_tag_kinds", + description=("Label fused element boxes 'text'/'widget' by source " + "(pure). 'elements' is a list of {x,y,width,height," + "source,index}. 
Returns {elements:[{box, kind, " + "index}]}."), + input_schema=schema({"elements": {"type": "array", + "items": {"type": "object"}}}, + required=["elements"]), + handler=h.tag_kinds, + annotations=READ_ONLY, + ), ] diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py index 66875681..91d38a68 100644 --- a/je_auto_control/utils/mcp_server/tools/_handlers.py +++ b/je_auto_control/utils/mcp_server/tools/_handlers.py @@ -797,6 +797,18 @@ def classify_icon(source, box): return _classify_icon(source, box) +def propose_elements(region=None, min_area=80, iou_threshold=0.5): + from je_auto_control.utils.executor.action_executor import ( + _propose_elements, + ) + return _propose_elements(region, min_area, iou_threshold) + + +def tag_kinds(elements): + from je_auto_control.utils.executor.action_executor import _tag_kinds + return _tag_kinds(elements) + + def normalize_ext(target): from je_auto_control.utils.executor.action_executor import _normalize_ext return _normalize_ext(target) diff --git a/test/unit_test/headless/test_element_proposal_batch.py b/test/unit_test/headless/test_element_proposal_batch.py new file mode 100644 index 00000000..c88fe950 --- /dev/null +++ b/test/unit_test/headless/test_element_proposal_batch.py @@ -0,0 +1,85 @@ +"""Headless tests for element_proposal (pure tag_kinds + cv2 pipeline).""" +import pytest + +import je_auto_control as ac +from je_auto_control.utils.element_proposal import propose_elements, tag_kinds + + +# --- pure tag_kinds ------------------------------------------------------- + +def test_tag_kinds_labels_by_source(): + fused = [ + {"x": 0, "y": 0, "width": 30, "height": 12, "source": "ocr", + "index": 0}, + {"x": 0, "y": 20, "width": 16, "height": 16, "source": "icon", + "index": 1}, + ] + tagged = tag_kinds(fused) + assert tagged[0] == {"box": [0, 0, 30, 12], "kind": "text", "index": 0} + assert tagged[1] == {"box": [0, 20, 16, 16], "kind": "widget", "index": 1} + 
+ +def test_tag_kinds_unknown_source_is_widget(): + tagged = tag_kinds([{"x": 1, "y": 2, "width": 3, "height": 4}]) + assert tagged[0]["kind"] == "widget" + assert tagged[0]["box"] == [1, 2, 3, 4] + + +def test_tag_kinds_empty(): + assert tag_kinds([]) == [] + + +# --- cv2 propose_elements (per-function importorskip) --------------------- + +def test_propose_elements_finds_widgets(): + np = pytest.importorskip("numpy") + cv2 = pytest.importorskip("cv2") + canvas = np.full((200, 240), 245, dtype="uint8") + # three distinct outlined "widgets" + cv2.rectangle(canvas, (20, 20), (90, 60), 0, 2) + cv2.rectangle(canvas, (130, 30), (210, 70), 0, 2) + cv2.rectangle(canvas, (40, 110), (200, 160), 0, 2) + elements = propose_elements(canvas, min_area=120) + assert len(elements) >= 2 + # every element is well-formed and in reading order + for position, element in enumerate(elements): + assert set(element) == {"box", "kind", "index"} + assert element["index"] == position + assert element["kind"] in ("text", "widget") + assert len(element["box"]) == 4 + # nothing spans the whole frame + assert all(not (e["box"][2] >= 228 and e["box"][3] >= 190) + for e in elements) + + +def test_propose_elements_blank_screen_is_empty_or_small(): + np = pytest.importorskip("numpy") + pytest.importorskip("cv2") + blank = np.full((120, 120), 255, dtype="uint8") + assert propose_elements(blank, min_area=200) == [] + + +# --- wiring (cv2-free) ---------------------------------------------------- + +def test_executor_pure_tag_path(): + from je_auto_control.utils.executor.action_executor import _tag_kinds + out = _tag_kinds('[{"x":0,"y":0,"width":10,"height":10,"source":"icon"}]') + assert out["elements"][0]["kind"] == "widget" + + +def test_wiring(): + known = set(ac.executor.known_commands()) + assert {"AC_propose_elements", "AC_tag_kinds"} <= known + from je_auto_control.utils.mcp_server.tools import ( + build_default_tool_registry, + ) + names = {t.name for t in build_default_tool_registry()} + 
assert {"ac_propose_elements", "ac_tag_kinds"} <= names + from je_auto_control.gui.script_builder.command_schema import _build_specs + specs = {s.command for s in _build_specs()} + assert {"AC_propose_elements", "AC_tag_kinds"} <= specs + + +def test_facade_exports(): + for name in ("propose_elements", "tag_kinds"): + assert hasattr(ac, name) and name in ac.__all__