diff --git a/WHATS_NEW.md b/WHATS_NEW.md index 74f81365..b5e8ffbd 100644 --- a/WHATS_NEW.md +++ b/WHATS_NEW.md @@ -1,5 +1,11 @@ # What's New — AutoControl +## What's new (2026-06-25) — Run-Trace Diff (what changed between two executions) + +See exactly what changed between a passing run and a failing one. Full reference: [`docs/source/Eng/doc/new_features/v192_features_doc.rst`](docs/source/Eng/doc/new_features/v192_features_doc.rst). + +- **`diff_runs` / `summarize_run_diff`** (`AC_diff_runs`): a run history says a run *failed* but not *what changed* from the run that passed. This aligns two step sequences with a longest-common-subsequence walk (so an inserted/removed step shifts the rest into place instead of mis-pairing everything) and classifies the differences: **added**/**removed** steps, **status_flips** (an aligned step that changed status — with the new failure's `failure_signature` when it carries an error), and **timing_regressions** (a step that got `regress_factor`× slower). `summarize_run_diff` renders a one-line summary. Pure stdlib over lists of `{name,status,duration,error}` step dicts. No `PySide6`. + ## What's new (2026-06-25) — Stable Failure Signatures Match the *same kind* of failure across runs, despite differing paths and ids. Full reference: [`docs/source/Eng/doc/new_features/v191_features_doc.rst`](docs/source/Eng/doc/new_features/v191_features_doc.rst). diff --git a/docs/source/Eng/doc/new_features/v192_features_doc.rst b/docs/source/Eng/doc/new_features/v192_features_doc.rst new file mode 100644 index 00000000..d717e2e1 --- /dev/null +++ b/docs/source/Eng/doc/new_features/v192_features_doc.rst @@ -0,0 +1,48 @@ +Run-Trace Diff (what changed between two executions) +==================================================== + +A run history tells you a run *failed*, but not *what changed* from the run that +passed: which step was added or dropped, which step flipped pass→fail, which step +got slower. 
``run_diff`` aligns the two step sequences with a longest-common- +subsequence walk — so an inserted or removed step shifts the rest into place +instead of mis-pairing everything — and classifies the differences: + +* **added** / **removed** — steps present in only one run, +* **status_flips** — an aligned step whose status changed, with the new failure's + :func:`failure_signature` when it carries an ``error``, +* **timing_regressions** — an aligned step that got ``regress_factor`` x slower. + +A step is any dict with a name key (default ``"name"``) and optional ``status`` / +``duration`` / ``error``. Pure standard library; no device, no ``PySide6``. + +Headless API +------------ + +.. code-block:: python + + from je_auto_control import diff_runs, summarize_run_diff + + before = [{"name": "login", "status": "ok", "duration": 1.0}, + {"name": "submit", "status": "ok", "duration": 1.0}] + after = [{"name": "login", "status": "ok", "duration": 1.1}, + {"name": "accept_cookies", "status": "ok"}, # inserted + {"name": "submit", "status": "error", "error": "Timeout ..."}] + + diff = diff_runs(before, after) + # {"added": [accept_cookies], "removed": [], + # "status_flips": [{"name": "submit", "from": "ok", "to": "error", + # "signature": "..."}], + # "timing_regressions": [], "aligned": 2, "identical": False} + + summarize_run_diff(diff) # "+1 added, 1 status flip(s)" + +``regress_factor`` (default ``1.5``) is the slowdown ratio that counts as a +regression; ``key`` selects the field steps are aligned on. ``summarize_run_diff`` +renders a one-line summary (``"no change"`` when identical). + +Executor commands +----------------- + +``AC_diff_runs`` (``before`` / ``after`` / ``key`` / ``regress_factor``) returns +the diff plus a ``summary`` field. It is exposed as the read-only ``ac_diff_runs`` +MCP tool and as a Script Builder command under **Testing**. 
diff --git a/docs/source/Zh/doc/new_features/v192_features_doc.rst b/docs/source/Zh/doc/new_features/v192_features_doc.rst new file mode 100644 index 00000000..73116cf0 --- /dev/null +++ b/docs/source/Zh/doc/new_features/v192_features_doc.rst @@ -0,0 +1,45 @@ +執行軌跡比較(兩次執行之間改變了什麼) +====================================== + +執行歷史告訴你某次執行*失敗*了,卻不告訴你相較於通過的那次*改變了什麼*:哪個步驟被加入或移除、 +哪個步驟由通過翻轉成失敗、哪個步驟變慢了。``run_diff`` 以最長共同子序列(LCS)走訪對齊兩個步驟 +序列——這樣插入或移除一個步驟會把其餘步驟順移到位,而非整個錯位配對——並將差異分類: + +* **added** / **removed** ——只存在於其中一次執行的步驟, +* **status_flips** ——某個已對齊步驟的狀態改變,若帶有 ``error`` 則附上新失敗的 + :func:`failure_signature`, +* **timing_regressions** ——某個已對齊步驟變慢了 ``regress_factor`` 倍。 + +步驟可為任何帶有名稱鍵(預設 ``"name"``)與選填 ``status`` / ``duration`` / ``error`` 的字典。 +純標準庫;不涉及裝置,不匯入 ``PySide6``。 + +無頭 API +-------- + +.. code-block:: python + + from je_auto_control import diff_runs, summarize_run_diff + + before = [{"name": "login", "status": "ok", "duration": 1.0}, + {"name": "submit", "status": "ok", "duration": 1.0}] + after = [{"name": "login", "status": "ok", "duration": 1.1}, + {"name": "accept_cookies", "status": "ok"}, # 插入 + {"name": "submit", "status": "error", "error": "Timeout ..."}] + + diff = diff_runs(before, after) + # {"added": [accept_cookies], "removed": [], + # "status_flips": [{"name": "submit", "from": "ok", "to": "error", + # "signature": "..."}], + # "timing_regressions": [], "aligned": 2, "identical": False} + + summarize_run_diff(diff) # "+1 added, 1 status flip(s)" + +``regress_factor``(預設 ``1.5``)是算作退化的變慢比率;``key`` 選擇步驟對齊所依據的欄位。 +``summarize_run_diff`` 產生一行摘要(相同時為 ``"no change"``)。 + +執行器指令 +---------- + +``AC_diff_runs``(``before`` / ``after`` / ``key`` / ``regress_factor``)回傳該差異並附帶 +``summary`` 欄位。以唯讀 ``ac_diff_runs`` MCP 工具及 Script Builder 指令(位於 **Testing** +分類下)形式提供。 diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py index 446a003b..35482895 100644 --- a/je_auto_control/__init__.py +++ b/je_auto_control/__init__.py @@ -92,6 +92,8 @@ from 
je_auto_control.utils.failure_signature import ( failure_signature, group_failures, normalize_error, ) +# Run-trace diff (LCS-aligned: added/removed steps, status flips, regressions) +from je_auto_control.utils.run_diff import diff_runs, summarize_run_diff # VLM element locator (headless) from je_auto_control.utils.vision import ( VLMNotAvailableError, click_by_description, locate_by_description, @@ -1670,6 +1672,7 @@ def start_autocontrol_gui(*args, **kwargs): "detect_scale", "scale_sweep", "saliency_map", "salient_regions", "most_salient", "normalize_error", "failure_signature", "group_failures", + "diff_runs", "summarize_run_diff", # VLM locator "VLMNotAvailableError", "locate_by_description", "click_by_description", "verify_description", diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py index 1ebbb836..9dc991d9 100644 --- a/je_auto_control/gui/script_builder/command_schema.py +++ b/je_auto_control/gui/script_builder/command_schema.py @@ -2723,6 +2723,19 @@ def _add_audit_specs(specs: List[CommandSpec]) -> None: placeholder='["err one", "err two"]'),), description="Group error messages by failure signature (most frequent).", )) + specs.append(CommandSpec( + "AC_diff_runs", "Testing", "Diff Two Run Traces", + fields=( + FieldSpec("before", FieldType.STRING, + placeholder='[{"name": "login", "status": "ok"}]'), + FieldSpec("after", FieldType.STRING, + placeholder='[{"name": "login", "status": "error"}]'), + FieldSpec("key", FieldType.STRING, optional=True, default="name"), + FieldSpec("regress_factor", FieldType.FLOAT, optional=True, + default=1.5), + ), + description="LCS-align two run step-traces: added/removed/flips/regress.", + )) specs.append(CommandSpec( "AC_scan_secrets", "Tools", "Scan for Hardcoded Secrets", description="Scan 'data' (JSON view) for hardcoded secrets that " diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py 
def _diff_runs(before: Any, after: Any, key: str = "name",
               regress_factor: Any = 1.5) -> Dict[str, Any]:
    """Adapter: diff two run step-traces (added/removed/flips/regressions).

    :param before: step list of the reference run, or JSON text
        (``str`` / ``bytes`` / ``bytearray``) encoding that list.
    :param after: step list of the run being compared, same forms.
    :param key: step field the two traces are aligned on. A blank value
        (``None`` or ``""``) falls back to ``"name"``.
    :param regress_factor: slowdown ratio that counts as a timing
        regression. A blank value (``None`` or ``""``) falls back to ``1.5``.
    :return: the ``diff_runs`` result with an extra ``"summary"`` entry
        holding the ``summarize_run_diff`` one-liner.
    :raises ValueError: when a text trace is not valid JSON, or
        ``regress_factor`` cannot be converted to ``float``.
    """
    import json
    from je_auto_control.utils.run_diff import diff_runs, summarize_run_diff

    def _decode(trace: Any) -> Any:
        # Text traces arrive as JSON; anything else is passed through.
        if isinstance(trace, (str, bytes, bytearray)):
            return json.loads(trace)
        return trace

    # str(None) would silently align on a field literally called "None",
    # and "" would give every step the same empty key.
    align_key = "name" if key in (None, "") else str(key)
    factor = 1.5 if regress_factor in (None, "") else float(regress_factor)
    diff = diff_runs(_decode(before), _decode(after), key=align_key,
                     regress_factor=factor)
    return {**diff, "summary": summarize_run_diff(diff)}
def diff_runs(before, after, key="name", regress_factor=1.5):
    """MCP handler: forward the arguments to the executor's run-diff adapter.

    The adapter is imported lazily inside the handler and its result is
    returned unchanged.
    """
    from je_auto_control.utils.executor.action_executor import (
        _diff_runs as run_adapter,
    )
    result = run_adapter(before, after, key, regress_factor)
    return result
"""Diff two run traces — what changed between two executions of a flow.

A run history tells you a run *failed*, but not *what changed* from the run
that passed: which step was added or dropped, which step flipped pass→fail,
which step got slower. ``run_diff`` aligns the two step sequences with a
longest-common-subsequence walk (so an inserted / removed step shifts the rest
into place instead of mis-pairing everything) and classifies the differences:

* **added** / **removed** steps (present in only one run),
* **status flips** (an aligned step whose status changed — with the new
  failure's :func:`failure_signature` when it carries an ``error``),
* **timing regressions** (an aligned step that got ``regress_factor`` x
  slower).

A step is any dict with a name key (default ``"name"``) and optional
``status`` / ``duration`` / ``error``.
"""
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple

Step = Dict[str, Any]


def _lcs_pairs(left: Sequence[str],
               right: Sequence[str]) -> List[Tuple[int, int]]:
    """Return aligned ``(i, j)`` index pairs of the longest common subsequence.

    ``table[i][j]`` holds the LCS length of ``left[i:]`` and ``right[j:]``;
    it is filled from the bottom-right corner so the forward walk below can
    consult the cells to its right and beneath it.
    """
    n, m = len(left), len(right)
    table = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(n - 1, -1, -1):
        for j in range(m - 1, -1, -1):
            if left[i] == right[j]:
                table[i][j] = table[i + 1][j + 1] + 1
            else:
                table[i][j] = max(table[i + 1][j], table[i][j + 1])
    pairs: List[Tuple[int, int]] = []
    i = j = 0
    while i < n and j < m:
        if left[i] == right[j]:
            pairs.append((i, j))
            i, j = i + 1, j + 1
        elif table[i + 1][j] >= table[i][j + 1]:
            # On a tie, skip the left element first.
            i += 1
        else:
            j += 1
    return pairs


def _status_flip(before: Step, after: Step, name: str) -> Dict[str, Any]:
    """Build a status-flip record, attaching a signature for a new error."""
    flip = {"name": name, "from": before.get("status"),
            "to": after.get("status")}
    error = after.get("error")
    if error:
        # Imported lazily: only needed when the new run carries an error.
        from je_auto_control.utils.failure_signature import failure_signature
        flip["signature"] = failure_signature(str(error))
    return flip


def _is_number(value: Any) -> bool:
    """Return True for ``int`` / ``float`` values, excluding ``bool``."""
    # bool is a subclass of int; a True/False "duration" is not a timing.
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def _regression(before: Step, after: Step, name: str,
                factor: float) -> Dict[str, Any]:
    """Return a timing-regression record, or ``{}`` if not a regression.

    Both durations must be real numbers and the earlier one positive (the
    ratio is undefined otherwise); the step regressed when the new duration
    is at least ``factor`` times the old one.
    """
    prev, curr = before.get("duration"), after.get("duration")
    if not (_is_number(prev) and _is_number(curr)):
        return {}
    if prev > 0 and curr >= prev * float(factor):
        return {"name": name, "before": float(prev), "after": float(curr),
                "ratio": round(curr / prev, 3)}
    return {}


def _aligned_changes(before: Sequence[Step], after: Sequence[Step],
                     names: Sequence[str], pairs: List[Tuple[int, int]],
                     factor: float) -> tuple:
    """Classify the LCS-aligned pairs into (status flips, timing regressions)."""
    flips, regressions = [], []
    for i, j in pairs:
        if before[i].get("status") != after[j].get("status"):
            flips.append(_status_flip(before[i], after[j], names[i]))
        regression = _regression(before[i], after[j], names[i], factor)
        if regression:
            regressions.append(regression)
    return flips, regressions


def _keys(steps: Sequence[Step], key: str) -> List[str]:
    """Return each step's alignment key as a string (``""`` when absent)."""
    return [str(step.get(key, "")) for step in steps]


def _unmatched(steps: Sequence[Step], matched: Set[int]) -> List[Step]:
    """Return the steps whose index is not in ``matched``, in order."""
    return [step for index, step in enumerate(steps) if index not in matched]


def diff_runs(before: Optional[Iterable[Step]],
              after: Optional[Iterable[Step]], *,
              key: str = "name", regress_factor: float = 1.5) -> Dict[str, Any]:
    """Diff two step sequences into ``{added, removed, status_flips,
    timing_regressions, aligned, identical}``.

    Steps are aligned by their ``key`` value via LCS; ``regress_factor`` is the
    slowdown ratio that counts as a timing regression. ``before`` / ``after``
    may be any iterable of step dicts; ``None`` is treated as an empty run.
    ``identical`` is True only when all four change lists are empty.
    """
    # Materialise once: the traces are both iterated and indexed below, so
    # a generator (or None) would otherwise fail.
    before = list(before or ())
    after = list(after or ())
    left, right = _keys(before, key), _keys(after, key)
    pairs = _lcs_pairs(left, right)
    flips, regressions = _aligned_changes(before, after, left, pairs,
                                          regress_factor)
    added = _unmatched(after, {j for _, j in pairs})
    removed = _unmatched(before, {i for i, _ in pairs})
    return {"added": added, "removed": removed, "status_flips": flips,
            "timing_regressions": regressions, "aligned": len(pairs),
            "identical": not any((added, removed, flips, regressions))}


def summarize_run_diff(diff: Dict[str, Any]) -> str:
    """Render a one-line human summary of a :func:`diff_runs` result.

    Returns ``"no change"`` when the diff is flagged identical or when none
    of its change lists has any entry, so the summary is never empty.
    """
    if diff.get("identical"):
        return "no change"
    parts = []
    for label, field in (("+{} added", "added"), ("-{} removed", "removed"),
                         ("{} status flip(s)", "status_flips"),
                         ("{} regression(s)", "timing_regressions")):
        # A missing or None field counts as zero entries.
        count = len(diff.get(field) or ())
        if count:
            parts.append(label.format(count))
    return ", ".join(parts) if parts else "no change"
diff_runs(_before(), _after()) + # the inserted step is the only add; login/open_form/submit stay aligned + assert [s["name"] for s in diff["added"]] == ["accept_cookies"] + assert diff["removed"] == [] + assert diff["aligned"] == 3 + assert diff["identical"] is False + + +def test_status_flip_carries_failure_signature(): + flips = diff_runs(_before(), _after())["status_flips"] + assert len(flips) == 1 + assert flips[0]["name"] == "submit" + assert flips[0]["from"] == "ok" and flips[0]["to"] == "error" + assert len(flips[0]["signature"]) == 12 # failure_signature attached + + +def test_timing_regression_detected_with_ratio(): + regs = diff_runs(_before(), _after())["timing_regressions"] + assert len(regs) == 1 and regs[0]["name"] == "open_form" + assert regs[0]["ratio"] == pytest.approx(2.5) + # a small slowdown under the factor is not a regression + slow = diff_runs([{"name": "a", "duration": 1.0}], + [{"name": "a", "duration": 1.2}]) + assert slow["timing_regressions"] == [] + + +def test_removed_step_detected(): + diff = diff_runs(_before(), [_before()[0], _before()[2]]) # drop open_form + assert [s["name"] for s in diff["removed"]] == ["open_form"] + assert diff["added"] == [] + + +def test_identical_runs(): + diff = diff_runs(_before(), _before()) + assert diff["identical"] is True + assert summarize_run_diff(diff) == "no change" + + +def test_summary_lists_changes(): + summary = summarize_run_diff(diff_runs(_before(), _after())) + assert "added" in summary and "flip" in summary and "regression" in summary + + +# --- wiring --------------------------------------------------------------- + +def test_executor_path_includes_summary(): + import json + from je_auto_control.utils.executor.action_executor import _diff_runs + out = _diff_runs(json.dumps(_before()), json.dumps(_after())) + assert out["aligned"] == 3 + assert out["summary"] == summarize_run_diff(diff_runs(_before(), _after())) + + +def test_wiring(): + known = set(ac.executor.known_commands()) + assert 
"AC_diff_runs" in known + from je_auto_control.utils.mcp_server.tools import build_default_tool_registry + names = {t.name for t in build_default_tool_registry()} + assert "ac_diff_runs" in names + from je_auto_control.gui.script_builder.command_schema import _build_specs + specs = {s.command for s in _build_specs()} + assert "AC_diff_runs" in specs + + +def test_facade_exports(): + for name in ("diff_runs", "summarize_run_diff"): + assert hasattr(ac, name) and name in ac.__all__