From aee77c4a02206b5029e81d43364fb7bc10082d77 Mon Sep 17 00:00:00 2001
From: JeffreyChen <zenxcvwait@gmail.com>
Date: Thu, 25 Jun 2026 01:40:02 +0800
Subject: [PATCH] Add run_diff: LCS-aligned diff of two run step-traces

A run history says a run failed but not what changed from the run that
passed. Align two step sequences with an LCS walk (so an inserted or
removed step shifts the rest into place instead of mis-pairing) and
classify the differences: added/removed steps, status flips (with the
new failure's signature), and timing regressions. summarize_run_diff
renders a one-line summary. Pure stdlib over step dicts.
---
 WHATS_NEW.md                                  |   6 +
 .../doc/new_features/v192_features_doc.rst    |  48 +++++++
 .../Zh/doc/new_features/v192_features_doc.rst |  45 +++++++
 je_auto_control/__init__.py                   |   3 +
 .../gui/script_builder/command_schema.py      |  13 ++
 .../utils/executor/action_executor.py         |  15 +++
 .../utils/mcp_server/tools/_factories.py      |  16 +++
 .../utils/mcp_server/tools/_handlers.py       |   5 +
 je_auto_control/utils/run_diff/__init__.py    |   4 +
 je_auto_control/utils/run_diff/run_diff.py    | 119 ++++++++++++++++++
 .../unit_test/headless/test_run_diff_batch.py |  93 ++++++++++++++
 11 files changed, 367 insertions(+)
 create mode 100644 docs/source/Eng/doc/new_features/v192_features_doc.rst
 create mode 100644 docs/source/Zh/doc/new_features/v192_features_doc.rst
 create mode 100644 je_auto_control/utils/run_diff/__init__.py
 create mode 100644 je_auto_control/utils/run_diff/run_diff.py
 create mode 100644 test/unit_test/headless/test_run_diff_batch.py

diff --git a/WHATS_NEW.md b/WHATS_NEW.md
index 74f81365..b5e8ffbd 100644
--- a/WHATS_NEW.md
+++ b/WHATS_NEW.md
@@ -1,5 +1,11 @@
 # What's New — AutoControl
 
+## What's new (2026-06-25) — Run-Trace Diff (what changed between two executions)
+
+See exactly what changed between a passing run and a failing one. Full reference: [`docs/source/Eng/doc/new_features/v192_features_doc.rst`](docs/source/Eng/doc/new_features/v192_features_doc.rst).
+
+- **`diff_runs` / `summarize_run_diff`** (`AC_diff_runs`): a run history says a run *failed* but not *what changed* from the run that passed. This aligns two step sequences with a longest-common-subsequence walk (so an inserted/removed step shifts the rest into place instead of mis-pairing everything) and classifies the differences: **added**/**removed** steps, **status_flips** (an aligned step that changed status — with the new failure's `failure_signature` when it carries an error), and **timing_regressions** (an aligned step that got at least `regress_factor`× slower). `summarize_run_diff` renders a one-line summary. Pure stdlib over lists of `{name,status,duration,error}` step dicts. No `PySide6`.
+
 ## What's new (2026-06-25) — Stable Failure Signatures
 
 Match the *same kind* of failure across runs, despite differing paths and ids. Full reference: [`docs/source/Eng/doc/new_features/v191_features_doc.rst`](docs/source/Eng/doc/new_features/v191_features_doc.rst).
diff --git a/docs/source/Eng/doc/new_features/v192_features_doc.rst b/docs/source/Eng/doc/new_features/v192_features_doc.rst
new file mode 100644
index 00000000..d717e2e1
--- /dev/null
+++ b/docs/source/Eng/doc/new_features/v192_features_doc.rst
@@ -0,0 +1,48 @@
+Run-Trace Diff (what changed between two executions)
+====================================================
+
+A run history tells you a run *failed*, but not *what changed* from the run that
+passed: which step was added or dropped, which step flipped pass→fail, which step
+got slower. ``run_diff`` aligns the two step sequences with a longest-common-
+subsequence walk — so an inserted or removed step shifts the rest into place
+instead of mis-pairing everything — and classifies the differences:
+
+* **added** / **removed** — steps present in only one run,
+* **status_flips** — an aligned step whose status changed, with the new failure's
+  :func:`failure_signature` when it carries an ``error``,
+* **timing_regressions** — an aligned step at least ``regress_factor`` x slower.
+
+A step is any dict with a name key (default ``"name"``) and optional ``status`` /
+``duration`` / ``error``. Pure standard library; no device, no ``PySide6``.
+
+Headless API
+------------
+
+.. code-block:: python
+
+    from je_auto_control import diff_runs, summarize_run_diff
+
+    before = [{"name": "login", "status": "ok", "duration": 1.0},
+              {"name": "submit", "status": "ok", "duration": 1.0}]
+    after = [{"name": "login", "status": "ok", "duration": 1.1},
+             {"name": "accept_cookies", "status": "ok"},          # inserted
+             {"name": "submit", "status": "error", "error": "Timeout ..."}]
+
+    diff = diff_runs(before, after)
+    # {"added": [{"name": "accept_cookies", "status": "ok"}], "removed": [],
+    #  "status_flips": [{"name": "submit", "from": "ok", "to": "error",
+    #                    "signature": "..."}],
+    #  "timing_regressions": [], "aligned": 2, "identical": False}
+
+    summarize_run_diff(diff)        # "+1 added, 1 status flip(s)"
+
+``regress_factor`` (default ``1.5``) is the slowdown ratio that counts as a
+regression; ``key`` selects the field steps are aligned on. ``summarize_run_diff``
+renders a one-line summary (``"no change"`` when identical).
+
+Executor commands
+-----------------
+
+``AC_diff_runs`` (``before`` / ``after`` / ``key`` / ``regress_factor``) returns
+the diff plus a ``summary`` field. It is exposed as the read-only ``ac_diff_runs``
+MCP tool and as a Script Builder command under **Testing**.
diff --git a/docs/source/Zh/doc/new_features/v192_features_doc.rst b/docs/source/Zh/doc/new_features/v192_features_doc.rst
new file mode 100644
index 00000000..73116cf0
--- /dev/null
+++ b/docs/source/Zh/doc/new_features/v192_features_doc.rst
@@ -0,0 +1,45 @@
+執行軌跡比較(兩次執行之間改變了什麼)
+======================================
+
+執行歷史告訴你某次執行*失敗*了,卻不告訴你相較於通過的那次*改變了什麼*:哪個步驟被加入或移除、
+哪個步驟由通過翻轉成失敗、哪個步驟變慢了。``run_diff`` 以最長共同子序列(LCS)走訪對齊兩個步驟
+序列——這樣插入或移除一個步驟會把其餘步驟順移到位,而非整個錯位配對——並將差異分類:
+
+* **added** / **removed** ——只存在於其中一次執行的步驟,
+* **status_flips** ——某個已對齊步驟的狀態改變,若帶有 ``error`` 則附上新失敗的
+  :func:`failure_signature`,
+* **timing_regressions** ——某個已對齊步驟的耗時達到先前的 ``regress_factor`` 倍以上。
+
+步驟可為任何帶有名稱鍵(預設 ``"name"``)與選填 ``status`` / ``duration`` / ``error`` 的字典。
+純標準庫;不涉及裝置,不匯入 ``PySide6``。
+
+無頭 API
+--------
+
+.. code-block:: python
+
+    from je_auto_control import diff_runs, summarize_run_diff
+
+    before = [{"name": "login", "status": "ok", "duration": 1.0},
+              {"name": "submit", "status": "ok", "duration": 1.0}]
+    after = [{"name": "login", "status": "ok", "duration": 1.1},
+             {"name": "accept_cookies", "status": "ok"},          # 插入
+             {"name": "submit", "status": "error", "error": "Timeout ..."}]
+
+    diff = diff_runs(before, after)
+    # {"added": [{"name": "accept_cookies", "status": "ok"}], "removed": [],
+    #  "status_flips": [{"name": "submit", "from": "ok", "to": "error",
+    #                    "signature": "..."}],
+    #  "timing_regressions": [], "aligned": 2, "identical": False}
+
+    summarize_run_diff(diff)        # "+1 added, 1 status flip(s)"
+
+``regress_factor``(預設 ``1.5``)是算作退化的變慢比率;``key`` 選擇步驟對齊所依據的欄位。
+``summarize_run_diff`` 產生一行摘要(相同時為 ``"no change"``)。
+
+執行器指令
+----------
+
+``AC_diff_runs``(``before`` / ``after`` / ``key`` / ``regress_factor``)回傳該差異並附帶
+``summary`` 欄位。以唯讀 ``ac_diff_runs`` MCP 工具及 Script Builder 指令(位於 **Testing**
+分類下)形式提供。
diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py
index 446a003b..35482895 100644
--- a/je_auto_control/__init__.py
+++ b/je_auto_control/__init__.py
@@ -92,6 +92,8 @@
 from je_auto_control.utils.failure_signature import (
     failure_signature, group_failures, normalize_error,
 )
+# Run-trace diff (LCS-aligned: added/removed steps, status flips, regressions)
+from je_auto_control.utils.run_diff import diff_runs, summarize_run_diff
 # VLM element locator (headless)
 from je_auto_control.utils.vision import (
     VLMNotAvailableError, click_by_description, locate_by_description,
@@ -1670,6 +1672,7 @@ def start_autocontrol_gui(*args, **kwargs):
     "detect_scale", "scale_sweep",
     "saliency_map", "salient_regions", "most_salient",
     "normalize_error", "failure_signature", "group_failures",
+    "diff_runs", "summarize_run_diff",
     # VLM locator
     "VLMNotAvailableError", "locate_by_description", "click_by_description",
     "verify_description",
diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py
index 1ebbb836..9dc991d9 100644
--- a/je_auto_control/gui/script_builder/command_schema.py
+++ b/je_auto_control/gui/script_builder/command_schema.py
@@ -2723,6 +2723,19 @@ def _add_audit_specs(specs: List[CommandSpec]) -> None:
                           placeholder='["err one", "err two"]'),),
         description="Group error messages by failure signature (most frequent).",
     ))
+    specs.append(CommandSpec(
+        "AC_diff_runs", "Testing", "Diff Two Run Traces",
+        fields=(
+            FieldSpec("before", FieldType.STRING,
+                      placeholder='[{"name": "login", "status": "ok"}]'),
+            FieldSpec("after", FieldType.STRING,
+                      placeholder='[{"name": "login", "status": "error"}]'),
+            FieldSpec("key", FieldType.STRING, optional=True, default="name"),
+            FieldSpec("regress_factor", FieldType.FLOAT, optional=True,
+                      default=1.5),
+        ),
+        description="LCS-align two run step-traces: added/removed/flips/regress.",
+    ))
     specs.append(CommandSpec(
         "AC_scan_secrets", "Tools", "Scan for Hardcoded Secrets",
         description="Scan 'data' (JSON view) for hardcoded secrets that "
diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py
index 304a9729..4c1706c6 100644
--- a/je_auto_control/utils/executor/action_executor.py
+++ b/je_auto_control/utils/executor/action_executor.py
@@ -4366,6 +4366,20 @@ def _group_failures(errors: Any) -> Dict[str, Any]:
     return {"groups": groups, "count": len(groups)}
 
 
+def _diff_runs(before: Any, after: Any, key: str = "name",
+               regress_factor: Any = 1.5) -> Dict[str, Any]:
+    """Adapter: diff two run step-traces (added/removed/flips/regressions)."""
+    import json
+    from je_auto_control.utils.run_diff import diff_runs, summarize_run_diff
+    if isinstance(before, str):  # JSON text is decoded; other values pass through
+        before = json.loads(before)
+    if isinstance(after, str):
+        after = json.loads(after)
+    diff = diff_runs(before, after, key=str(key),
+                     regress_factor=float(regress_factor))  # coerced: str/float
+    return {**diff, "summary": summarize_run_diff(diff)}  # diff keys + "summary"
+
+
 def _image_histogram(source: Any = None, bins: Any = 32, space: str = "hsv",
                      region: Any = None) -> Dict[str, Any]:
     """Adapter: per-channel colour histogram of an image / the screen."""
@@ -6596,6 +6610,7 @@ def __init__(self):
             "AC_most_salient": _most_salient,
             "AC_failure_signature": _failure_signature,
             "AC_group_failures": _group_failures,
+            "AC_diff_runs": _diff_runs,
             "AC_image_histogram": _image_histogram,
             "AC_histogram_changed": _histogram_changed,
             "AC_changed_regions": _changed_regions,
diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py
index 8a3ddfbc..a0934ecb 100644
--- a/je_auto_control/utils/mcp_server/tools/_factories.py
+++ b/je_auto_control/utils/mcp_server/tools/_factories.py
@@ -7675,6 +7675,22 @@ def flakiness_tools() -> List[MCPTool]:
             handler=h.group_failures,
             annotations=READ_ONLY,
         ),
+        MCPTool(
+            name="ac_diff_runs",
+            description=("Diff two run step-traces (lists of {name,status,"
+                         "duration,error}) — LCS-aligned so inserts shift rather "
+                         "than mis-pair. Returns {added, removed, status_flips "
+                         "(with failure signature), timing_regressions, aligned, "
+                         "identical, summary}. 'regress_factor' = slowdown ratio."),
+            input_schema=schema({
+                "before": {"type": "array", "items": {"type": "object"}},
+                "after": {"type": "array", "items": {"type": "object"}},
+                "key": {"type": "string"},
+                "regress_factor": {"type": "number"}},
+                required=["before", "after"]),
+            handler=h.diff_runs,
+            annotations=READ_ONLY,
+        ),
     ]
 
 
diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py
index 552793a8..0914842b 100644
--- a/je_auto_control/utils/mcp_server/tools/_handlers.py
+++ b/je_auto_control/utils/mcp_server/tools/_handlers.py
@@ -2553,6 +2553,11 @@ def group_failures(errors):
     return _group_failures(errors)
 
 
+def diff_runs(before, after, key="name", regress_factor=1.5):
+    from je_auto_control.utils.executor.action_executor import _diff_runs
+    return _diff_runs(before, after, key, regress_factor)  # delegate to executor
+
+
 def image_histogram(source=None, bins=32, space="hsv", region=None):
     from je_auto_control.utils.executor.action_executor import _image_histogram
     return _image_histogram(source, bins, space, region)
diff --git a/je_auto_control/utils/run_diff/__init__.py b/je_auto_control/utils/run_diff/__init__.py
new file mode 100644
index 00000000..05cee29c
--- /dev/null
+++ b/je_auto_control/utils/run_diff/__init__.py
@@ -0,0 +1,4 @@
+"""Diff two run traces (LCS-aligned step diff: added/removed/flips/regressions)."""
+from je_auto_control.utils.run_diff.run_diff import diff_runs, summarize_run_diff
+
+__all__ = ["diff_runs", "summarize_run_diff"]
diff --git a/je_auto_control/utils/run_diff/run_diff.py b/je_auto_control/utils/run_diff/run_diff.py
new file mode 100644
index 00000000..26b2f494
--- /dev/null
+++ b/je_auto_control/utils/run_diff/run_diff.py
@@ -0,0 +1,119 @@
+"""Diff two run traces — what changed between two executions of a flow.
+
+A run history tells you a run *failed*, but not *what changed* from the run that
+passed: which step was added or dropped, which step flipped pass→fail, which step
+got slower. ``run_diff`` aligns the two step sequences with a longest-common-
+subsequence walk (so an inserted / removed step shifts the rest into place instead
+of mis-pairing everything) and classifies the differences:
+
+* **added** / **removed** steps (present in only one run),
+* **status flips** (an aligned step whose status changed — with the new failure's
+  :func:`failure_signature` when it carries an ``error``),
+* **timing regressions** (an aligned step at least ``regress_factor`` x slower).
+
+A step is any dict with a name key (default ``"name"``) and optional ``status`` /
+``duration`` / ``error``. Pure standard library; no device, no ``PySide6``.
+"""
+from typing import Any, Dict, List, Sequence
+
+Step = Dict[str, Any]
+
+
+def _lcs_pairs(left: Sequence[str], right: Sequence[str]) -> List[tuple]:
+    """Return aligned ``(i, j)`` index pairs of the longest common subsequence."""
+    n, m = len(left), len(right)
+    table = [[0] * (m + 1) for _ in range(n + 1)]  # extra zero row/column as base
+    for i in range(n - 1, -1, -1):  # table[i][j]: LCS length of left[i:], right[j:]
+        for j in range(m - 1, -1, -1):
+            if left[i] == right[j]:
+                table[i][j] = table[i + 1][j + 1] + 1
+            else:
+                table[i][j] = max(table[i + 1][j], table[i][j + 1])
+    pairs, i, j = [], 0, 0
+    while i < n and j < m:  # replay the table from the start to emit matches
+        if left[i] == right[j]:
+            pairs.append((i, j))
+            i, j = i + 1, j + 1
+        elif table[i + 1][j] >= table[i][j + 1]:  # on a tie, advance the left side
+            i += 1
+        else:
+            j += 1
+    return pairs
+
+
+def _status_flip(before: Step, after: Step, name: str) -> Dict[str, Any]:
+    """Build a status-flip record, attaching a signature for a new error."""
+    flip = {"name": name, "from": before.get("status"),
+            "to": after.get("status")}  # a missing "status" is recorded as None
+    error = after.get("error")
+    if error:  # only a truthy "error" on the after-step gets a signature
+        from je_auto_control.utils.failure_signature import failure_signature
+        flip["signature"] = failure_signature(str(error))
+    return flip
+
+
+def _regression(before: Step, after: Step, name: str,
+                factor: float) -> Dict[str, Any]:
+    """Return a timing-regression record, or ``{}`` if not a regression."""
+    prev, curr = before.get("duration"), after.get("duration")
+    if not isinstance(prev, (int, float)) or not isinstance(curr, (int, float)):
+        return {}  # missing or non-numeric duration: nothing to compare
+    if prev > 0 and curr >= prev * float(factor):  # prev > 0 guards the division
+        return {"name": name, "before": float(prev), "after": float(curr),
+                "ratio": round(curr / prev, 3)}
+    return {}
+
+
+def _aligned_changes(before: Sequence[Step], after: Sequence[Step],
+                     names: Sequence[str], pairs: List[tuple],
+                     factor: float) -> tuple:
+    """Classify the LCS-aligned pairs into (status flips, timing regressions)."""
+    flips, regressions = [], []
+    for i, j in pairs:  # i indexes ``before``, j indexes ``after``
+        if before[i].get("status") != after[j].get("status"):
+            flips.append(_status_flip(before[i], after[j], names[i]))
+        regression = _regression(before[i], after[j], names[i], factor)
+        if regression:  # an empty dict means "not a regression"
+            regressions.append(regression)
+    return flips, regressions
+
+
+def _keys(steps: Sequence[Step], key: str) -> List[str]:  # alignment keys
+    return [str(step.get(key, "")) for step in steps]  # missing key -> ""
+
+
+def _unmatched(steps: Sequence[Step], matched: set) -> List[Step]:  # keeps order
+    return [steps[k] for k in range(len(steps)) if k not in matched]
+
+
+def diff_runs(before: Sequence[Step], after: Sequence[Step], *,
+              key: str = "name", regress_factor: float = 1.5) -> Dict[str, Any]:
+    """Diff two step sequences into ``{added, removed, status_flips,
+    timing_regressions, aligned, identical}``.
+
+    Steps are aligned by their ``key`` value via LCS; ``regress_factor`` is the
+    slowdown ratio that counts as a timing regression.
+    """
+    left, right = _keys(before, key), _keys(after, key)  # one string key per step
+    pairs = _lcs_pairs(left, right)
+    flips, regressions = _aligned_changes(before, after, left, pairs,
+                                          regress_factor)
+    added = _unmatched(after, {j for _, j in pairs})  # after-steps left unpaired
+    removed = _unmatched(before, {i for i, _ in pairs})  # before-steps left unpaired
+    return {"added": added, "removed": removed, "status_flips": flips,
+            "timing_regressions": regressions, "aligned": len(pairs),
+            "identical": not any((added, removed, flips, regressions))}
+
+
+def summarize_run_diff(diff: Dict[str, Any]) -> str:
+    """Render a one-line human summary of a :func:`diff_runs` result."""
+    if diff.get("identical"):
+        return "no change"
+    parts = []
+    for label, field in (("+{} added", "added"), ("-{} removed", "removed"),
+                         ("{} status flip(s)", "status_flips"),
+                         ("{} regression(s)", "timing_regressions")):
+        count = len(diff.get(field, []))  # an absent field counts as zero
+        if count:  # categories with no entries are left out
+            parts.append(label.format(count))
+    return ", ".join(parts)
diff --git a/test/unit_test/headless/test_run_diff_batch.py b/test/unit_test/headless/test_run_diff_batch.py
new file mode 100644
index 00000000..a705afdc
--- /dev/null
+++ b/test/unit_test/headless/test_run_diff_batch.py
@@ -0,0 +1,93 @@
+"""Headless tests for run-trace diffing (LCS-aligned step diff)."""
+import pytest
+
+import je_auto_control as ac
+from je_auto_control.utils.run_diff import diff_runs, summarize_run_diff
+
+
+def _before():  # baseline trace: three "ok" steps
+    return [
+        {"name": "login", "status": "ok", "duration": 1.0},
+        {"name": "open_form", "status": "ok", "duration": 2.0},
+        {"name": "submit", "status": "ok", "duration": 1.0},
+    ]
+
+
+def _after():  # changed trace compared against _before()
+    return [
+        {"name": "login", "status": "ok", "duration": 1.1},
+        {"name": "accept_cookies", "status": "ok", "duration": 0.5},   # inserted
+        {"name": "open_form", "status": "ok", "duration": 5.0},        # 2.5x slower
+        {"name": "submit", "status": "error", "duration": 1.0,
+         "error": r"Timeout at C:\app.py line 42 (0x7ff)"},           # flip
+    ]
+
+
+def test_lcs_alignment_isolates_the_insert():
+    diff = diff_runs(_before(), _after())
+    # the inserted step is the only add; login/open_form/submit stay aligned
+    assert [s["name"] for s in diff["added"]] == ["accept_cookies"]
+    assert diff["removed"] == []  # nothing was dropped
+    assert diff["aligned"] == 3
+    assert diff["identical"] is False
+
+
+def test_status_flip_carries_failure_signature():
+    flips = diff_runs(_before(), _after())["status_flips"]
+    assert len(flips) == 1  # "submit" is the only status change
+    assert flips[0]["name"] == "submit"
+    assert flips[0]["from"] == "ok" and flips[0]["to"] == "error"
+    assert len(flips[0]["signature"]) == 12       # failure_signature attached
+
+
+def test_timing_regression_detected_with_ratio():
+    regs = diff_runs(_before(), _after())["timing_regressions"]
+    assert len(regs) == 1 and regs[0]["name"] == "open_form"
+    assert regs[0]["ratio"] == pytest.approx(2.5)  # 5.0 / 2.0
+    # a small slowdown under the factor is not a regression
+    slow = diff_runs([{"name": "a", "duration": 1.0}],
+                     [{"name": "a", "duration": 1.2}])
+    assert slow["timing_regressions"] == []
+
+
+def test_removed_step_detected():
+    diff = diff_runs(_before(), [_before()[0], _before()[2]])  # drop open_form
+    assert [s["name"] for s in diff["removed"]] == ["open_form"]
+    assert diff["added"] == []  # a removal must not show up as an add
+
+
+def test_identical_runs():
+    diff = diff_runs(_before(), _before())  # same trace on both sides
+    assert diff["identical"] is True
+    assert summarize_run_diff(diff) == "no change"
+
+
+def test_summary_lists_changes():
+    summary = summarize_run_diff(diff_runs(_before(), _after()))  # one-line text
+    assert "added" in summary and "flip" in summary and "regression" in summary
+
+
+# --- wiring ---------------------------------------------------------------
+
+def test_executor_path_includes_summary():
+    import json
+    from je_auto_control.utils.executor.action_executor import _diff_runs
+    out = _diff_runs(json.dumps(_before()), json.dumps(_after()))  # JSON-text inputs
+    assert out["aligned"] == 3
+    assert out["summary"] == summarize_run_diff(diff_runs(_before(), _after()))
+
+
+def test_wiring():
+    known = set(ac.executor.known_commands())
+    assert "AC_diff_runs" in known  # executor command
+    from je_auto_control.utils.mcp_server.tools import build_default_tool_registry
+    names = {t.name for t in build_default_tool_registry()}
+    assert "ac_diff_runs" in names  # MCP tool
+    from je_auto_control.gui.script_builder.command_schema import _build_specs
+    specs = {s.command for s in _build_specs()}
+    assert "AC_diff_runs" in specs  # Script Builder command
+
+
+def test_facade_exports():
+    for name in ("diff_runs", "summarize_run_diff"):  # package-level re-exports
+        assert hasattr(ac, name) and name in ac.__all__