From 3fd8b686d9e52117a51f26c866488db99a302759 Mon Sep 17 00:00:00 2001
From: JeffreyChen <zenxcvwait@gmail.com>
Date: Thu, 25 Jun 2026 01:49:53 +0800
Subject: [PATCH] Add flake_cluster: cluster tests that flake together
 (co-failure Jaccard)

Flaky tests are rarely independent - a wobbly fixture or noisy
dependency makes a group fail in the same runs (~75% of flaky tests
cluster). Ranking tests one-by-one by flip rate misses that shared root
cause. Measure how often each pair fails in the same runs (Jaccard over
their failing-run sets) and group tests above a threshold into connected
clusters with a cohesion score. Pure stdlib over a list of failed-test
sets.
---
 WHATS_NEW.md                                  |  6 ++
 .../doc/new_features/v193_features_doc.rst    | 46 +++++++++
 .../Zh/doc/new_features/v193_features_doc.rst | 41 ++++++++
 je_auto_control/__init__.py                   |  3 +
 .../gui/script_builder/command_schema.py      | 19 ++++
 .../utils/executor/action_executor.py         | 24 +++++
 .../utils/flake_cluster/__init__.py           |  6 ++
 .../utils/flake_cluster/flake_cluster.py      | 97 +++++++++++++++++++
 .../utils/mcp_server/tools/_factories.py      | 29 ++++++
 .../utils/mcp_server/tools/_handlers.py       | 10 ++
 .../headless/test_flake_cluster_batch.py      | 83 ++++++++++++++++
 11 files changed, 364 insertions(+)
 create mode 100644 docs/source/Eng/doc/new_features/v193_features_doc.rst
 create mode 100644 docs/source/Zh/doc/new_features/v193_features_doc.rst
 create mode 100644 je_auto_control/utils/flake_cluster/__init__.py
 create mode 100644 je_auto_control/utils/flake_cluster/flake_cluster.py
 create mode 100644 test/unit_test/headless/test_flake_cluster_batch.py

diff --git a/WHATS_NEW.md b/WHATS_NEW.md
index b5e8ffbd..3789042d 100644
--- a/WHATS_NEW.md
+++ b/WHATS_NEW.md
@@ -1,5 +1,11 @@
 # What's New — AutoControl
 
+## What's new (2026-06-25) — Flaky-Test Co-Failure Clustering
+
+Find the tests that flake *together* — and the shared root cause behind them. Full reference: [`docs/source/Eng/doc/new_features/v193_features_doc.rst`](docs/source/Eng/doc/new_features/v193_features_doc.rst).
+
+- **`cofailure_pairs` / `failure_clusters`** (`AC_cofailure_pairs`, `AC_failure_clusters`): flaky tests are rarely independent — a wobbly fixture or noisy dependency makes a *group* fail in the same runs (~75% of flaky tests cluster). Ranking tests one-by-one by flip rate misses that. This measures how often each pair of tests fails in the *same* runs (Jaccard over their failing-run sets) and groups tests at or above a threshold into connected clusters with a cohesion score — so you chase one root cause instead of N symptoms. Input is a list of runs, each the test names that failed in it. Pure stdlib. No `PySide6`.
+
 ## What's new (2026-06-25) — Run-Trace Diff (what changed between two executions)
 
 See exactly what changed between a passing run and a failing one. Full reference: [`docs/source/Eng/doc/new_features/v192_features_doc.rst`](docs/source/Eng/doc/new_features/v192_features_doc.rst).
diff --git a/docs/source/Eng/doc/new_features/v193_features_doc.rst b/docs/source/Eng/doc/new_features/v193_features_doc.rst
new file mode 100644
index 00000000..3abad25e
--- /dev/null
+++ b/docs/source/Eng/doc/new_features/v193_features_doc.rst
@@ -0,0 +1,46 @@
+Flaky-Test Co-Failure Clustering
+================================
+
+Flaky tests are rarely independent: a wobbly shared fixture, a slow dependency or
+a noisy environment makes a *group* of tests fail in the same runs (research finds
+~75% of flaky tests fall into co-failure clusters). Ranking tests one-by-one by
+flip rate misses that shared root cause. ``flake_cluster`` measures how often each
+pair of tests fails in the *same* runs — Jaccard similarity over the set of runs
+each failed in — and groups tests whose co-failure meets a threshold, so you can
+chase one root cause instead of N symptoms.
+
+* :func:`cofailure_pairs` — test pairs that fail together at or above a threshold,
+* :func:`failure_clusters` — connected clusters of co-failing tests with a
+  cohesion score (mean pairwise Jaccard).
+
+Input is a list of runs, each a collection of the test names that failed in that
+run. Pure standard library; no device, no ``PySide6``.
+
+Headless API
+------------
+
+.. code-block:: python
+
+    from je_auto_control import failure_clusters, cofailure_pairs
+
+    runs = [["test_a", "test_b"],            # both failed in this run
+            ["test_a", "test_b"],
+            ["test_c"],
+            ["test_a", "test_b", "test_c"]]
+
+    failure_clusters(runs, threshold=0.6)
+    # [{"tests": ["test_a", "test_b"], "size": 2, "cohesion": 1.0}]
+
+    cofailure_pairs(runs, threshold=0.6)
+    # [{"tests": ["test_a", "test_b"], "jaccard": 1.0, "co_failures": 3}]
+
+``threshold`` is the minimum co-failure Jaccard to link two tests; ``min_size``
+(default ``2``) drops singletons so only genuine clusters surface. Clusters come
+back largest / most cohesive first.
+
+Executor commands
+-----------------
+
+``AC_failure_clusters`` (``runs`` / ``threshold`` / ``min_size``) and
+``AC_cofailure_pairs`` (``runs`` / ``threshold``). They are exposed as read-only
+``ac_*`` MCP tools and as Script Builder commands under **Testing**.
diff --git a/docs/source/Zh/doc/new_features/v193_features_doc.rst b/docs/source/Zh/doc/new_features/v193_features_doc.rst
new file mode 100644
index 00000000..222fc3a7
--- /dev/null
+++ b/docs/source/Zh/doc/new_features/v193_features_doc.rst
@@ -0,0 +1,41 @@
+不穩定測試的共同失敗分群
+========================
+
+不穩定(flaky)測試很少是獨立的:搖晃的共用 fixture、緩慢的相依、或吵雜的環境,會讓*一群*測試在
+相同的執行中一起失敗(研究發現約 75% 的 flaky 測試落在共同失敗的群集裡)。逐一以翻轉率排名測試
+會錯過這個共同根因。``flake_cluster`` 量測每對測試多常在*相同*執行中失敗——即各自失敗的執行集合
+之間的 Jaccard 相似度——並把共同失敗達到門檻的測試分群,讓你能追一個根因,而非 N 個症狀。
+
+* :func:`cofailure_pairs` ——共同失敗達到門檻的測試對,
+* :func:`failure_clusters` ——共同失敗測試的連通群集,附凝聚度分數(群內平均成對 Jaccard)。
+
+輸入是一份執行清單,每個元素為該次執行中失敗的測試名稱集合。純標準庫;不涉及裝置,不匯入
+``PySide6``。
+
+無頭 API
+--------
+
+.. code-block:: python
+
+    from je_auto_control import failure_clusters, cofailure_pairs
+
+    runs = [["test_a", "test_b"],            # 這次執行兩者皆失敗
+            ["test_a", "test_b"],
+            ["test_c"],
+            ["test_a", "test_b", "test_c"]]
+
+    failure_clusters(runs, threshold=0.6)
+    # [{"tests": ["test_a", "test_b"], "size": 2, "cohesion": 1.0}]
+
+    cofailure_pairs(runs, threshold=0.6)
+    # [{"tests": ["test_a", "test_b"], "jaccard": 1.0, "co_failures": 3}]
+
+``threshold`` 是連結兩測試所需的最小共同失敗 Jaccard;``min_size``(預設 ``2``)會丟棄單例,
+讓只有真正的群集浮現。群集以最大 / 最凝聚者在前回傳。
+
+執行器指令
+----------
+
+``AC_failure_clusters``(``runs`` / ``threshold`` / ``min_size``)與
+``AC_cofailure_pairs``(``runs`` / ``threshold``)。皆以唯讀 ``ac_*`` MCP 工具及 Script Builder
+指令(位於 **Testing** 分類下)形式提供。
diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py
index 35482895..39c53d8a 100644
--- a/je_auto_control/__init__.py
+++ b/je_auto_control/__init__.py
@@ -94,6 +94,8 @@
 )
 # Run-trace diff (LCS-aligned: added/removed steps, status flips, regressions)
 from je_auto_control.utils.run_diff import diff_runs, summarize_run_diff
+# Flaky-test co-failure clustering (Jaccard over shared failing runs)
+from je_auto_control.utils.flake_cluster import cofailure_pairs, failure_clusters
 # VLM element locator (headless)
 from je_auto_control.utils.vision import (
     VLMNotAvailableError, click_by_description, locate_by_description,
@@ -1673,6 +1675,7 @@ def start_autocontrol_gui(*args, **kwargs):
     "saliency_map", "salient_regions", "most_salient",
     "normalize_error", "failure_signature", "group_failures",
     "diff_runs", "summarize_run_diff",
+    "cofailure_pairs", "failure_clusters",
     # VLM locator
     "VLMNotAvailableError", "locate_by_description", "click_by_description",
     "verify_description",
diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py
index 9dc991d9..b26d514f 100644
--- a/je_auto_control/gui/script_builder/command_schema.py
+++ b/je_auto_control/gui/script_builder/command_schema.py
@@ -2736,6 +2736,25 @@ def _add_audit_specs(specs: List[CommandSpec]) -> None:
         ),
         description="LCS-align two run step-traces: added/removed/flips/regress.",
     ))
+    specs.append(CommandSpec(
+        "AC_failure_clusters", "Testing", "Cluster Co-Failing Tests",
+        fields=(
+            FieldSpec("runs", FieldType.STRING,
+                      placeholder='[["test_a", "test_b"], ["test_a", "test_b"]]'),
+            FieldSpec("threshold", FieldType.FLOAT, optional=True, default=0.5),
+            FieldSpec("min_size", FieldType.INT, optional=True, default=2),
+        ),
+        description="Cluster tests that flake together (co-failure Jaccard).",
+    ))
+    specs.append(CommandSpec(
+        "AC_cofailure_pairs", "Testing", "Co-Failing Test Pairs",
+        fields=(
+            FieldSpec("runs", FieldType.STRING,
+                      placeholder='[["test_a", "test_b"]]'),
+            FieldSpec("threshold", FieldType.FLOAT, optional=True, default=0.5),
+        ),
+        description="Test pairs that fail together above a Jaccard threshold.",
+    ))
     specs.append(CommandSpec(
         "AC_scan_secrets", "Tools", "Scan for Hardcoded Secrets",
         description="Scan 'data' (JSON view) for hardcoded secrets that "
diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py
index 4c1706c6..377cb4f9 100644
--- a/je_auto_control/utils/executor/action_executor.py
+++ b/je_auto_control/utils/executor/action_executor.py
@@ -4380,6 +4380,28 @@ def _diff_runs(before: Any, after: Any, key: str = "name",
     return {**diff, "summary": summarize_run_diff(diff)}
 
 
+def _failure_clusters(runs: Any, threshold: Any = 0.5,
+                      min_size: Any = 2) -> Dict[str, Any]:
+    """Adapter: cluster co-failing tests; ``runs`` is a list or JSON text."""
+    import json
+    from je_auto_control.utils.flake_cluster import failure_clusters
+    parsed = json.loads(runs) if isinstance(runs, str) else runs
+    found = failure_clusters(
+        parsed, threshold=float(threshold), min_size=int(min_size),
+    )
+    return {"clusters": found, "count": len(found)}
+
+
+def _cofailure_pairs(runs: Any, threshold: Any = 0.5) -> Dict[str, Any]:
+    """Adapter: co-failing test pairs; ``runs`` is a list or JSON text."""
+    import json
+    from je_auto_control.utils.flake_cluster import cofailure_pairs
+    parsed = json.loads(runs) if isinstance(runs, str) else runs
+    found = cofailure_pairs(parsed, threshold=float(threshold))
+    result = {"pairs": found, "count": len(found)}
+    return result
+
+
 def _image_histogram(source: Any = None, bins: Any = 32, space: str = "hsv",
                      region: Any = None) -> Dict[str, Any]:
     """Adapter: per-channel colour histogram of an image / the screen."""
@@ -6611,6 +6633,8 @@ def __init__(self):
             "AC_failure_signature": _failure_signature,
             "AC_group_failures": _group_failures,
             "AC_diff_runs": _diff_runs,
+            "AC_failure_clusters": _failure_clusters,
+            "AC_cofailure_pairs": _cofailure_pairs,
             "AC_image_histogram": _image_histogram,
             "AC_histogram_changed": _histogram_changed,
             "AC_changed_regions": _changed_regions,
diff --git a/je_auto_control/utils/flake_cluster/__init__.py b/je_auto_control/utils/flake_cluster/__init__.py
new file mode 100644
index 00000000..376dbb05
--- /dev/null
+++ b/je_auto_control/utils/flake_cluster/__init__.py
@@ -0,0 +1,6 @@
+"""Cluster tests that flake together by co-failure Jaccard similarity."""
+from je_auto_control.utils.flake_cluster.flake_cluster import (
+    cofailure_pairs, failure_clusters,
+)
+
+__all__ = ["cofailure_pairs", "failure_clusters"]
diff --git a/je_auto_control/utils/flake_cluster/flake_cluster.py b/je_auto_control/utils/flake_cluster/flake_cluster.py
new file mode 100644
index 00000000..d35d261c
--- /dev/null
+++ b/je_auto_control/utils/flake_cluster/flake_cluster.py
@@ -0,0 +1,97 @@
+"""Cluster tests that flake *together* by co-failure similarity.
+
+Flaky tests are rarely independent: a wobbly shared fixture, a slow dependency or
+a noisy environment makes a *group* of tests fail in the same runs (research finds
+~75% of flaky tests fall into co-failure clusters). Ranking tests one-by-one by
+flip rate misses that shared root cause. ``flake_cluster`` measures how often each
+pair of tests fails in the *same* runs (Jaccard similarity over the set of runs
+each failed in) and groups tests whose co-failure exceeds a threshold into
+clusters — so you can chase one root cause instead of N symptoms.
+
+Input is a list of runs, each a collection of the test names that failed in that
+run. Pure standard library; no device, no ``PySide6``.
+"""
+from itertools import combinations
+from typing import Any, Dict, List, Sequence, Set
+
+
+def _set_jaccard(left: Set[int], right: Set[int]) -> float:
+    """Return |left & right| / |left | right|; two empty sets score 0.0."""
+    return len(left & right) / len(left | right) if left or right else 0.0
+
+
+def _fail_runs(runs: Sequence[Sequence[str]]) -> Dict[str, Set[int]]:
+    """Map test name -> run indices it failed in; a bare-str run is one name."""
+    fails: Dict[str, Set[int]] = {}
+    for index, run in enumerate(runs):
+        for test in ({run} if isinstance(run, str) else set(run)):
+            fails.setdefault(str(test), set()).add(index)
+    return fails
+
+
+def cofailure_pairs(runs: Sequence[Sequence[str]], *,
+                    threshold: float = 0.5) -> List[Dict[str, Any]]:
+    """List the test pairs whose co-failure Jaccard is >= ``threshold``.
+
+    Entries are ``{tests:[a,b], jaccard, co_failures}``, highest Jaccard first.
+    """
+    failed = _fail_runs(runs)
+    hits: List[Dict[str, Any]] = []
+    for first, second in combinations(sorted(failed), 2):
+        runs_a, runs_b = failed[first], failed[second]
+        sim = _set_jaccard(runs_a, runs_b)
+        if sim >= float(threshold):
+            hits.append({"tests": [first, second], "jaccard": round(sim, 3),
+                         "co_failures": len(runs_a & runs_b)})
+    return sorted(hits, key=lambda hit: hit["jaccard"], reverse=True)
+
+
+def _connected_components(nodes: Sequence[str],
+                          adjacency: Dict[str, Set[str]]) -> List[List[str]]:
+    """Split ``nodes`` into connected components via an explicit-stack walk."""
+    visited: Set[str] = set()
+    groups: List[List[str]] = []
+    for start in nodes:
+        if start in visited:
+            continue
+        pending, members = [start], []
+        while pending:
+            here = pending.pop()
+            if here not in visited:
+                visited.add(here)
+                members.append(here)
+                pending.extend(adjacency[here] - visited)
+        groups.append(members)
+    return groups
+
+
+def _cohesion(component: Sequence[str], fails: Dict[str, Set[int]]) -> float:
+    links = list(combinations(component, 2))
+    total = sum(_set_jaccard(fails[a], fails[b]) for a, b in links)
+    return round(total / len(links), 3) if links else 1.0  # no pair -> 1.0
+
+
+def failure_clusters(runs: Sequence[Sequence[str]], *, threshold: float = 0.5,
+                     min_size: int = 2) -> List[Dict[str, Any]]:
+    """Cluster tests that tend to fail in the same runs.
+
+    Tests are nodes; an edge joins two tests whose co-failure Jaccard is at
+    least ``threshold``. Every connected component with ``min_size`` or more
+    tests is reported as ``{tests, size, cohesion}`` (``cohesion`` = mean
+    pairwise Jaccard inside it), sorted by size then cohesion, descending.
+    """
+    failed = _fail_runs(runs)
+    names = sorted(failed)
+    linked: Dict[str, Set[str]] = {name: set() for name in names}
+    # Add an undirected edge for every pair that reaches the threshold.
+    for first, second in combinations(names, 2):
+        if _set_jaccard(failed[first], failed[second]) >= float(threshold):
+            linked[first].add(second)
+            linked[second].add(first)
+    # Keep only the components large enough to count as a cluster.
+    groups = [group for group in _connected_components(names, linked)
+              if len(group) >= int(min_size)]
+    found = [{"tests": sorted(group), "size": len(group),
+              "cohesion": _cohesion(group, failed)} for group in groups]
+    return sorted(found, reverse=True,
+                  key=lambda item: (item["size"], item["cohesion"]))
diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py
index a0934ecb..95752986 100644
--- a/je_auto_control/utils/mcp_server/tools/_factories.py
+++ b/je_auto_control/utils/mcp_server/tools/_factories.py
@@ -7691,6 +7691,35 @@ def flakiness_tools() -> List[MCPTool]:
             handler=h.diff_runs,
             annotations=READ_ONLY,
         ),
+        MCPTool(
+            name="ac_failure_clusters",
+            description=("Cluster tests that flake TOGETHER: 'runs' is a list of "
+                         "runs, each the list of test names that failed in that "
+                         "run. Groups tests whose co-failure Jaccard >= "
+                         "'threshold'. Returns {clusters:[{tests,size,cohesion}], "
+                         "count} — chase one root cause, not N symptoms."),
+            input_schema=schema({
+                "runs": {"type": "array",
+                         "items": {"type": "array", "items": {"type": "string"}}},
+                "threshold": {"type": "number"},
+                "min_size": {"type": "integer"}},
+                required=["runs"]),
+            handler=h.failure_clusters,
+            annotations=READ_ONLY,
+        ),
+        MCPTool(
+            name="ac_cofailure_pairs",
+            description=("Test pairs that fail together above a Jaccard "
+                         "'threshold' over shared failing runs: "
+                         "{pairs:[{tests,jaccard,co_failures}], count}."),
+            input_schema=schema({
+                "runs": {"type": "array",
+                         "items": {"type": "array", "items": {"type": "string"}}},
+                "threshold": {"type": "number"}},
+                required=["runs"]),
+            handler=h.cofailure_pairs,
+            annotations=READ_ONLY,
+        ),
     ]
 
 
diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py
index 0914842b..34319fcd 100644
--- a/je_auto_control/utils/mcp_server/tools/_handlers.py
+++ b/je_auto_control/utils/mcp_server/tools/_handlers.py
@@ -2558,6 +2558,16 @@ def diff_runs(before, after, key="name", regress_factor=1.5):
     return _diff_runs(before, after, key, regress_factor)
 
 
+def failure_clusters(runs, threshold=0.5, min_size=2):
+    from je_auto_control.utils.executor.action_executor import _failure_clusters
+    return _failure_clusters(runs, threshold=threshold, min_size=min_size)
+
+
+def cofailure_pairs(runs, threshold=0.5):
+    from je_auto_control.utils.executor.action_executor import _cofailure_pairs
+    return _cofailure_pairs(runs, threshold=threshold)
+
+
 def image_histogram(source=None, bins=32, space="hsv", region=None):
     from je_auto_control.utils.executor.action_executor import _image_histogram
     return _image_histogram(source, bins, space, region)
diff --git a/test/unit_test/headless/test_flake_cluster_batch.py b/test/unit_test/headless/test_flake_cluster_batch.py
new file mode 100644
index 00000000..e415668e
--- /dev/null
+++ b/test/unit_test/headless/test_flake_cluster_batch.py
@@ -0,0 +1,83 @@
+"""Headless tests for co-failure flake clustering (Jaccard over failing runs)."""
+import pytest
+
+import je_auto_control as ac
+from je_auto_control.utils.flake_cluster import cofailure_pairs, failure_clusters
+
+
+def _runs():
+    """Fixture: a/b always fail together, c/d always together, e alone."""
+    ab, cd = ["a", "b"], ["c", "d"]
+    return [
+        list(ab), list(ab),
+        list(cd),
+        ab + cd,
+        ["e"],
+        list(cd),
+    ]
+
+
+def test_clusters_group_cofailing_tests():
+    clusters = failure_clusters(_runs(), threshold=0.6)
+    grouped = sorted(tuple(c["tests"]) for c in clusters)  # order-agnostic
+    assert grouped == [("a", "b"), ("c", "d")]  # a-c Jaccard is only 1/5
+    assert all(c["cohesion"] == pytest.approx(1.0) for c in clusters)
+    assert all(c["size"] == 2 for c in clusters)
+
+
+def test_singleton_excluded_by_min_size():
+    # 'e' never co-fails; the default min_size=2 drops it from every cluster
+    tests_in_clusters = {t for c in failure_clusters(_runs()) for t in c["tests"]}
+    assert "e" not in tests_in_clusters
+
+
+def test_min_size_includes_singletons_when_one():  # pairless cohesion is 1.0
+    clusters = failure_clusters([["e"]], threshold=0.5, min_size=1)
+    assert clusters == [{"tests": ["e"], "size": 1, "cohesion": 1.0}]
+
+
+def test_high_threshold_keeps_only_perfect_cofailure():
+    # a&b and c&d each co-fail perfectly (Jaccard 1.0); a&c share 1 of 5 runs
+    clusters = failure_clusters(_runs(), threshold=0.95)
+    assert sorted(tuple(c["tests"]) for c in clusters) == [("a", "b"), ("c", "d")]
+
+
+def test_cofailure_pairs_scores_and_sorted():
+    pairs = cofailure_pairs(_runs(), threshold=0.6)
+    assert {tuple(p["tests"]) for p in pairs} == {("a", "b"), ("c", "d")}
+    assert pairs[0]["jaccard"] == pytest.approx(1.0)
+    assert pairs[0]["co_failures"] == 3  # a&b and c&d each share three runs
+
+
+def test_empty_and_no_cofailure():
+    assert failure_clusters([]) == []  # no runs, so no tests to cluster
+    assert failure_clusters([["x"], ["y"], ["z"]]) == []   # nobody co-fails
+
+
+# --- wiring ---------------------------------------------------------------
+
+def test_executor_paths():
+    import json
+    from je_auto_control.utils.executor.action_executor import (
+        _cofailure_pairs, _failure_clusters)
+    runs_json = json.dumps(_runs())  # adapters also accept JSON text
+    clusters = _failure_clusters(runs_json, threshold=0.6)
+    assert clusters["count"] == 2
+    pairs = _cofailure_pairs(runs_json, threshold=0.6)
+    assert pairs["count"] == 2
+
+
+def test_wiring():
+    known = set(ac.executor.known_commands())  # executor commands
+    assert {"AC_failure_clusters", "AC_cofailure_pairs"} <= known
+    from je_auto_control.utils.mcp_server.tools import build_default_tool_registry
+    names = {t.name for t in build_default_tool_registry()}  # MCP tools
+    assert {"ac_failure_clusters", "ac_cofailure_pairs"} <= names
+    from je_auto_control.gui.script_builder.command_schema import _build_specs
+    specs = {s.command for s in _build_specs()}  # Script Builder specs
+    assert {"AC_failure_clusters", "AC_cofailure_pairs"} <= specs
+
+
+def test_facade_exports():
+    for name in ("cofailure_pairs", "failure_clusters"):
+        assert hasattr(ac, name) and name in ac.__all__  # attribute and __all__