From 3fd8b686d9e52117a51f26c866488db99a302759 Mon Sep 17 00:00:00 2001 From: JeffreyChen Date: Thu, 25 Jun 2026 01:49:53 +0800 Subject: [PATCH] Add flake_cluster: cluster tests that flake together (co-failure Jaccard) Flaky tests are rarely independent - a wobbly fixture or noisy dependency makes a group fail in the same runs (~75% of flaky tests cluster). Ranking tests one-by-one by flip rate misses that shared root cause. Measure how often each pair fails in the same runs (Jaccard over their failing-run sets) and group tests above a threshold into connected clusters with a cohesion score. Pure stdlib over a list of failed-test sets. --- WHATS_NEW.md | 6 ++ .../doc/new_features/v193_features_doc.rst | 46 +++++++++ .../Zh/doc/new_features/v193_features_doc.rst | 41 ++++++++ je_auto_control/__init__.py | 3 + .../gui/script_builder/command_schema.py | 19 ++++ .../utils/executor/action_executor.py | 24 +++++ .../utils/flake_cluster/__init__.py | 6 ++ .../utils/flake_cluster/flake_cluster.py | 97 +++++++++++++++++++ .../utils/mcp_server/tools/_factories.py | 29 ++++++ .../utils/mcp_server/tools/_handlers.py | 10 ++ .../headless/test_flake_cluster_batch.py | 83 ++++++++++++++++ 11 files changed, 364 insertions(+) create mode 100644 docs/source/Eng/doc/new_features/v193_features_doc.rst create mode 100644 docs/source/Zh/doc/new_features/v193_features_doc.rst create mode 100644 je_auto_control/utils/flake_cluster/__init__.py create mode 100644 je_auto_control/utils/flake_cluster/flake_cluster.py create mode 100644 test/unit_test/headless/test_flake_cluster_batch.py diff --git a/WHATS_NEW.md b/WHATS_NEW.md index b5e8ffbd..3789042d 100644 --- a/WHATS_NEW.md +++ b/WHATS_NEW.md @@ -1,5 +1,11 @@ # What's New — AutoControl +## What's new (2026-06-25) — Flaky-Test Co-Failure Clustering + +Find the tests that flake *together* — and the shared root cause behind them. Full reference: [`docs/source/Eng/doc/new_features/v193_features_doc.rst`](docs/source/Eng/doc/new_features/v193_features_doc.rst). + +- **`cofailure_pairs` / `failure_clusters`** (`AC_cofailure_pairs`, `AC_failure_clusters`): flaky tests are rarely independent — a wobbly fixture or noisy dependency makes a *group* fail in the same runs (~75% of flaky tests cluster). Ranking tests one-by-one by flip rate misses that. This measures how often each pair of tests fails in the *same* runs (Jaccard over their failing-run sets) and groups tests above a threshold into connected clusters with a cohesion score — so you chase one root cause instead of N symptoms. Input is a list of runs, each the test names that failed in it. Pure stdlib. No `PySide6`. + ## What's new (2026-06-25) — Run-Trace Diff (what changed between two executions) See exactly what changed between a passing run and a failing one. Full reference: [`docs/source/Eng/doc/new_features/v192_features_doc.rst`](docs/source/Eng/doc/new_features/v192_features_doc.rst). diff --git a/docs/source/Eng/doc/new_features/v193_features_doc.rst b/docs/source/Eng/doc/new_features/v193_features_doc.rst new file mode 100644 index 00000000..3abad25e --- /dev/null +++ b/docs/source/Eng/doc/new_features/v193_features_doc.rst @@ -0,0 +1,46 @@ +Flaky-Test Co-Failure Clustering +================================ + +Flaky tests are rarely independent: a wobbly shared fixture, a slow dependency or +a noisy environment makes a *group* of tests fail in the same runs (research finds +~75% of flaky tests fall into co-failure clusters). Ranking tests one-by-one by +flip rate misses that shared root cause. ``flake_cluster`` measures how often each +pair of tests fails in the *same* runs — Jaccard similarity over the set of runs +each failed in — and groups tests whose co-failure exceeds a threshold, so you can +chase one root cause instead of N symptoms. + +* :func:`cofailure_pairs` — test pairs that fail together above a threshold, +* :func:`failure_clusters` — connected clusters of co-failing tests with a + cohesion score (mean pairwise Jaccard). + +Input is a list of runs, each a collection of the test names that failed in that +run. Pure standard library; no device, no ``PySide6``. + +Headless API +------------ + +.. code-block:: python + + from je_auto_control import failure_clusters, cofailure_pairs + + runs = [["test_a", "test_b"], # both failed in this run + ["test_a", "test_b"], + ["test_c"], + ["test_a", "test_b", "test_c"]] + + failure_clusters(runs, threshold=0.6) + # [{"tests": ["test_a", "test_b"], "size": 2, "cohesion": 1.0}] + + cofailure_pairs(runs, threshold=0.6) + # [{"tests": ["test_a", "test_b"], "jaccard": 1.0, "co_failures": 3}] + +``threshold`` is the minimum co-failure Jaccard to link two tests; ``min_size`` +(default ``2``) drops singletons so only genuine clusters surface. Clusters come +back largest / most cohesive first. + +Executor commands +----------------- + +``AC_failure_clusters`` (``runs`` / ``threshold`` / ``min_size``) and +``AC_cofailure_pairs`` (``runs`` / ``threshold``). They are exposed as read-only +``ac_*`` MCP tools and as Script Builder commands under **Testing**. diff --git a/docs/source/Zh/doc/new_features/v193_features_doc.rst b/docs/source/Zh/doc/new_features/v193_features_doc.rst new file mode 100644 index 00000000..222fc3a7 --- /dev/null +++ b/docs/source/Zh/doc/new_features/v193_features_doc.rst @@ -0,0 +1,41 @@ +不穩定測試的共同失敗分群 +======================== + +不穩定(flaky)測試很少是獨立的:搖晃的共用 fixture、緩慢的相依、或吵雜的環境,會讓*一群*測試在 +相同的執行中一起失敗(研究發現約 75% 的 flaky 測試落在共同失敗的群集裡)。逐一以翻轉率排名測試 +會錯過這個共同根因。``flake_cluster`` 量測每對測試多常在*相同*執行中失敗——即各自失敗的執行集合 +之間的 Jaccard 相似度——並把共同失敗超過門檻的測試分群,讓你能追一個根因,而非 N 個症狀。 + +* :func:`cofailure_pairs` ——共同失敗超過門檻的測試對, +* :func:`failure_clusters` ——共同失敗測試的連通群集,附凝聚度分數(群內平均成對 Jaccard)。 + +輸入是一份執行清單,每個元素為該次執行中失敗的測試名稱集合。純標準庫;不涉及裝置,不匯入 +``PySide6``。 + +無頭 API +-------- + +.. code-block:: python + + from je_auto_control import failure_clusters, cofailure_pairs + + runs = [["test_a", "test_b"], # 這次執行兩者皆失敗 + ["test_a", "test_b"], + ["test_c"], + ["test_a", "test_b", "test_c"]] + + failure_clusters(runs, threshold=0.6) + # [{"tests": ["test_a", "test_b"], "size": 2, "cohesion": 1.0}] + + cofailure_pairs(runs, threshold=0.6) + # [{"tests": ["test_a", "test_b"], "jaccard": 1.0, "co_failures": 3}] + +``threshold`` 是連結兩測試所需的最小共同失敗 Jaccard;``min_size``(預設 ``2``)會丟棄單例, +讓只有真正的群集浮現。群集以最大 / 最凝聚者在前回傳。 + +執行器指令 +---------- + +``AC_failure_clusters``(``runs`` / ``threshold`` / ``min_size``)與 +``AC_cofailure_pairs``(``runs`` / ``threshold``)。皆以唯讀 ``ac_*`` MCP 工具及 Script Builder +指令(位於 **Testing** 分類下)形式提供。 diff --git a/je_auto_control/__init__.py b/je_auto_control/__init__.py index 35482895..39c53d8a 100644 --- a/je_auto_control/__init__.py +++ b/je_auto_control/__init__.py @@ -94,6 +94,8 @@ ) # Run-trace diff (LCS-aligned: added/removed steps, status flips, regressions) from je_auto_control.utils.run_diff import diff_runs, summarize_run_diff +# Flaky-test co-failure clustering (Jaccard over shared failing runs) +from je_auto_control.utils.flake_cluster import cofailure_pairs, failure_clusters # VLM element locator (headless) from je_auto_control.utils.vision import ( VLMNotAvailableError, click_by_description, locate_by_description, @@ -1673,6 +1675,7 @@ def start_autocontrol_gui(*args, **kwargs): "saliency_map", "salient_regions", "most_salient", "normalize_error", "failure_signature", "group_failures", "diff_runs", "summarize_run_diff", + "cofailure_pairs", "failure_clusters", # VLM locator "VLMNotAvailableError", "locate_by_description", "click_by_description", "verify_description", diff --git a/je_auto_control/gui/script_builder/command_schema.py b/je_auto_control/gui/script_builder/command_schema.py index 9dc991d9..b26d514f 100644 --- a/je_auto_control/gui/script_builder/command_schema.py +++ b/je_auto_control/gui/script_builder/command_schema.py @@ -2736,6 +2736,25 @@ def _add_audit_specs(specs: List[CommandSpec]) -> None: ), description="LCS-align two run step-traces: added/removed/flips/regress.", )) + specs.append(CommandSpec( + "AC_failure_clusters", "Testing", "Cluster Co-Failing Tests", + fields=( + FieldSpec("runs", FieldType.STRING, + placeholder='[["test_a", "test_b"], ["test_a", "test_b"]]'), + FieldSpec("threshold", FieldType.FLOAT, optional=True, default=0.5), + FieldSpec("min_size", FieldType.INT, optional=True, default=2), + ), + description="Cluster tests that flake together (co-failure Jaccard).", + )) + specs.append(CommandSpec( + "AC_cofailure_pairs", "Testing", "Co-Failing Test Pairs", + fields=( + FieldSpec("runs", FieldType.STRING, + placeholder='[["test_a", "test_b"]]'), + FieldSpec("threshold", FieldType.FLOAT, optional=True, default=0.5), + ), + description="Test pairs that fail together above a Jaccard threshold.", + )) specs.append(CommandSpec( "AC_scan_secrets", "Tools", "Scan for Hardcoded Secrets", description="Scan 'data' (JSON view) for hardcoded secrets that " diff --git a/je_auto_control/utils/executor/action_executor.py b/je_auto_control/utils/executor/action_executor.py index 4c1706c6..377cb4f9 100644 --- a/je_auto_control/utils/executor/action_executor.py +++ b/je_auto_control/utils/executor/action_executor.py @@ -4380,6 +4380,28 @@ def _diff_runs(before: Any, after: Any, key: str = "name", return {**diff, "summary": summarize_run_diff(diff)} +def _failure_clusters(runs: Any, threshold: Any = 0.5, + min_size: Any = 2) -> Dict[str, Any]: + """Adapter: cluster tests that fail together (co-failure Jaccard).""" + import json + from je_auto_control.utils.flake_cluster import failure_clusters + if isinstance(runs, str): + runs = json.loads(runs) + clusters = failure_clusters(runs, threshold=float(threshold), + min_size=int(min_size)) + return {"clusters": clusters, "count": len(clusters)} + + +def _cofailure_pairs(runs: Any, threshold: Any = 0.5) -> Dict[str, Any]: + """Adapter: test pairs that fail together above a Jaccard threshold.""" + import json + from je_auto_control.utils.flake_cluster import cofailure_pairs + if isinstance(runs, str): + runs = json.loads(runs) + pairs = cofailure_pairs(runs, threshold=float(threshold)) + return {"pairs": pairs, "count": len(pairs)} + + def _image_histogram(source: Any = None, bins: Any = 32, space: str = "hsv", region: Any = None) -> Dict[str, Any]: """Adapter: per-channel colour histogram of an image / the screen.""" @@ -6611,6 +6633,8 @@ def __init__(self): "AC_failure_signature": _failure_signature, "AC_group_failures": _group_failures, "AC_diff_runs": _diff_runs, + "AC_failure_clusters": _failure_clusters, + "AC_cofailure_pairs": _cofailure_pairs, "AC_image_histogram": _image_histogram, "AC_histogram_changed": _histogram_changed, "AC_changed_regions": _changed_regions, diff --git a/je_auto_control/utils/flake_cluster/__init__.py b/je_auto_control/utils/flake_cluster/__init__.py new file mode 100644 index 00000000..376dbb05 --- /dev/null +++ b/je_auto_control/utils/flake_cluster/__init__.py @@ -0,0 +1,6 @@ +"""Cluster tests that flake together by co-failure Jaccard similarity.""" +from je_auto_control.utils.flake_cluster.flake_cluster import ( + cofailure_pairs, failure_clusters, +) + +__all__ = ["cofailure_pairs", "failure_clusters"] diff --git a/je_auto_control/utils/flake_cluster/flake_cluster.py b/je_auto_control/utils/flake_cluster/flake_cluster.py new file mode 100644 index 00000000..d35d261c --- /dev/null +++ b/je_auto_control/utils/flake_cluster/flake_cluster.py @@ -0,0 +1,97 @@ +"""Cluster tests that flake *together* by co-failure similarity. + +Flaky tests are rarely independent: a wobbly shared fixture, a slow dependency or +a noisy environment makes a *group* of tests fail in the same runs (research finds +~75% of flaky tests fall into co-failure clusters). Ranking tests one-by-one by +flip rate misses that shared root cause. ``flake_cluster`` measures how often each +pair of tests fails in the *same* runs (Jaccard similarity over the set of runs +each failed in) and groups tests whose co-failure exceeds a threshold into +clusters — so you can chase one root cause instead of N symptoms. + +Input is a list of runs, each a collection of the test names that failed in that +run. Pure standard library; no device, no ``PySide6``. +""" +from itertools import combinations +from typing import Any, Dict, List, Sequence, Set + + +def _set_jaccard(left: Set[int], right: Set[int]) -> float: + union = left | right + return len(left & right) / len(union) if union else 0.0 + + +def _fail_runs(runs: Sequence[Sequence[str]]) -> Dict[str, Set[int]]: + """Map each test name to the set of run indices in which it failed.""" + fails: Dict[str, Set[int]] = {} + for index, run in enumerate(runs): + for test in set(run): + fails.setdefault(str(test), set()).add(index) + return fails + + +def cofailure_pairs(runs: Sequence[Sequence[str]], *, + threshold: float = 0.5) -> List[Dict[str, Any]]: + """Return test pairs whose co-failure Jaccard meets ``threshold``. + + Each entry is ``{tests:[a,b], jaccard, co_failures}``, most similar first. + """ + fails = _fail_runs(runs) + pairs = [] + for left, right in combinations(sorted(fails), 2): + score = _set_jaccard(fails[left], fails[right]) + if score >= float(threshold): + pairs.append({"tests": [left, right], "jaccard": round(score, 3), + "co_failures": len(fails[left] & fails[right])}) + pairs.sort(key=lambda pair: pair["jaccard"], reverse=True) + return pairs + + +def _connected_components(nodes: Sequence[str], + adjacency: Dict[str, Set[str]]) -> List[List[str]]: + seen: Set[str] = set() + components = [] + for node in nodes: + if node in seen: + continue + stack, component = [node], [] + while stack: + current = stack.pop() + if current in seen: + continue + seen.add(current) + component.append(current) + stack.extend(adjacency[current] - seen) + components.append(component) + return components + + +def _cohesion(component: Sequence[str], fails: Dict[str, Set[int]]) -> float: + scores = [_set_jaccard(fails[a], fails[b]) + for a, b in combinations(component, 2)] + return round(sum(scores) / len(scores), 3) if scores else 1.0 + + +def failure_clusters(runs: Sequence[Sequence[str]], *, threshold: float = 0.5, + min_size: int = 2) -> List[Dict[str, Any]]: + """Group tests that fail together into co-failure clusters. + + Builds a graph linking test pairs whose co-failure Jaccard meets + ``threshold``, then returns its connected components of at least ``min_size`` + tests as ``[{tests, size, cohesion}]`` (largest / most cohesive first). + ``cohesion`` is the mean pairwise Jaccard within the cluster. + """ + fails = _fail_runs(runs) + tests = sorted(fails) + adjacency: Dict[str, Set[str]] = {test: set() for test in tests} + for left, right in combinations(tests, 2): + if _set_jaccard(fails[left], fails[right]) >= float(threshold): + adjacency[left].add(right) + adjacency[right].add(left) + clusters = [] + for component in _connected_components(tests, adjacency): + if len(component) >= int(min_size): + clusters.append({"tests": sorted(component), "size": len(component), + "cohesion": _cohesion(component, fails)}) + clusters.sort(key=lambda cluster: (cluster["size"], cluster["cohesion"]), + reverse=True) + return clusters diff --git a/je_auto_control/utils/mcp_server/tools/_factories.py b/je_auto_control/utils/mcp_server/tools/_factories.py index a0934ecb..95752986 100644 --- a/je_auto_control/utils/mcp_server/tools/_factories.py +++ b/je_auto_control/utils/mcp_server/tools/_factories.py @@ -7691,6 +7691,35 @@ def flakiness_tools() -> List[MCPTool]: handler=h.diff_runs, annotations=READ_ONLY, ), + MCPTool( + name="ac_failure_clusters", + description=("Cluster tests that flake TOGETHER: 'runs' is a list of " + "runs, each the list of test names that failed in that " + "run. Groups tests whose co-failure Jaccard >= " + "'threshold'. Returns {clusters:[{tests,size,cohesion}], " + "count} — chase one root cause, not N symptoms."), + input_schema=schema({ + "runs": {"type": "array", + "items": {"type": "array", "items": {"type": "string"}}}, + "threshold": {"type": "number"}, + "min_size": {"type": "integer"}}, + required=["runs"]), + handler=h.failure_clusters, + annotations=READ_ONLY, + ), + MCPTool( + name="ac_cofailure_pairs", + description=("Test pairs that fail together above a Jaccard " + "'threshold' over shared failing runs: " + "{pairs:[{tests,jaccard,co_failures}], count}."), + input_schema=schema({ + "runs": {"type": "array", + "items": {"type": "array", "items": {"type": "string"}}}, + "threshold": {"type": "number"}}, + required=["runs"]), + handler=h.cofailure_pairs, + annotations=READ_ONLY, + ), ] diff --git a/je_auto_control/utils/mcp_server/tools/_handlers.py b/je_auto_control/utils/mcp_server/tools/_handlers.py index 0914842b..34319fcd 100644 --- a/je_auto_control/utils/mcp_server/tools/_handlers.py +++ b/je_auto_control/utils/mcp_server/tools/_handlers.py @@ -2558,6 +2558,16 @@ def diff_runs(before, after, key="name", regress_factor=1.5): return _diff_runs(before, after, key, regress_factor) +def failure_clusters(runs, threshold=0.5, min_size=2): + from je_auto_control.utils.executor.action_executor import _failure_clusters + return _failure_clusters(runs, threshold, min_size) + + +def cofailure_pairs(runs, threshold=0.5): + from je_auto_control.utils.executor.action_executor import _cofailure_pairs + return _cofailure_pairs(runs, threshold) + + def image_histogram(source=None, bins=32, space="hsv", region=None): from je_auto_control.utils.executor.action_executor import _image_histogram return _image_histogram(source, bins, space, region) diff --git a/test/unit_test/headless/test_flake_cluster_batch.py b/test/unit_test/headless/test_flake_cluster_batch.py new file mode 100644 index 00000000..e415668e --- /dev/null +++ b/test/unit_test/headless/test_flake_cluster_batch.py @@ -0,0 +1,83 @@ +"""Headless tests for co-failure flake clustering (Jaccard over failing runs).""" +import pytest + +import je_auto_control as ac +from je_auto_control.utils.flake_cluster import cofailure_pairs, failure_clusters + + +def _runs(): + # a&b always fail together; c&d always together; e fails alone + return [ + ["a", "b"], + ["a", "b"], + ["c", "d"], + ["a", "b", "c", "d"], + ["e"], + ["c", "d"], + ] + + +def test_clusters_group_cofailing_tests(): + clusters = failure_clusters(_runs(), threshold=0.6) + grouped = sorted(tuple(c["tests"]) for c in clusters) + assert grouped == [("a", "b"), ("c", "d")] + assert all(c["cohesion"] == pytest.approx(1.0) for c in clusters) + assert all(c["size"] == 2 for c in clusters) + + +def test_singleton_excluded_by_min_size(): + # 'e' never co-fails, so it is not in any cluster of size >= 2 + tests_in_clusters = {t for c in failure_clusters(_runs()) for t in c["tests"]} + assert "e" not in tests_in_clusters + + +def test_min_size_includes_singletons_when_one(): + clusters = failure_clusters([["e"]], threshold=0.5, min_size=1) + assert clusters == [{"tests": ["e"], "size": 1, "cohesion": 1.0}] + + +def test_high_threshold_keeps_only_perfect_cofailure(): + # a&b co-fail perfectly (jaccard 1.0); a&c only sometimes + clusters = failure_clusters(_runs(), threshold=0.95) + assert sorted(tuple(c["tests"]) for c in clusters) == [("a", "b"), ("c", "d")] + + +def test_cofailure_pairs_scores_and_sorted(): + pairs = cofailure_pairs(_runs(), threshold=0.6) + assert {tuple(p["tests"]) for p in pairs} == {("a", "b"), ("c", "d")} + assert pairs[0]["jaccard"] == pytest.approx(1.0) + assert pairs[0]["co_failures"] == 3 + + +def test_empty_and_no_cofailure(): + assert failure_clusters([]) == [] + assert failure_clusters([["x"], ["y"], ["z"]]) == [] # nobody co-fails + + +# --- wiring --------------------------------------------------------------- + +def test_executor_paths(): + import json + from je_auto_control.utils.executor.action_executor import ( + _cofailure_pairs, _failure_clusters) + runs_json = json.dumps(_runs()) + clusters = _failure_clusters(runs_json, threshold=0.6) + assert clusters["count"] == 2 + pairs = _cofailure_pairs(runs_json, threshold=0.6) + assert pairs["count"] == 2 + + +def test_wiring(): + known = set(ac.executor.known_commands()) + assert {"AC_failure_clusters", "AC_cofailure_pairs"} <= known + from je_auto_control.utils.mcp_server.tools import build_default_tool_registry + names = {t.name for t in build_default_tool_registry()} + assert {"ac_failure_clusters", "ac_cofailure_pairs"} <= names + from je_auto_control.gui.script_builder.command_schema import _build_specs + specs = {s.command for s in _build_specs()} + assert {"AC_failure_clusters", "AC_cofailure_pairs"} <= specs + + +def test_facade_exports(): + for name in ("cofailure_pairs", "failure_clusters"): + assert hasattr(ac, name) and name in ac.__all__