From 64f575a5a24917b0d1184b200e36d85e97317c95 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= <thymikee@gmail.com>
Date: Mon, 11 May 2026 11:29:47 +0200
Subject: [PATCH] chore: upgrade skillgym to 0.8.0

---
 package.json                                  |  2 +-
 pnpm-lock.yaml                                | 10 +++---
 test/skillgym/README.md                       | 18 ++++++++---
 .../suites/agent-device-smoke-suite.ts        | 32 ++++++++++++-------
 4 files changed, 40 insertions(+), 22 deletions(-)

diff --git a/package.json b/package.json
index 1ab9730ac..f7a3d101b 100644
--- a/package.json
+++ b/package.json
@@ -166,7 +166,7 @@
     "fallow": "^2.52.0",
     "oxfmt": "^0.42.0",
     "oxlint": "^1.57.0",
-    "skillgym": "^0.6.0",
+    "skillgym": "^0.8.0",
     "typescript": "^6.0.2",
     "vite": "^8.0.10"
   }
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index ab91178ba..76bb8ce55 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -37,8 +37,8 @@ importers:
         specifier: ^1.57.0
         version: 1.57.0
       skillgym:
-        specifier: ^0.6.0
-        version: 0.6.0
+        specifier: ^0.8.0
+        version: 0.8.0
       typescript:
         specifier: ^6.0.2
         version: 6.0.2
@@ -1824,8 +1824,8 @@ packages:
   siginfo@2.0.0:
     resolution: {integrity: sha512-ybx0WO1/8bSBLEWXZvEd7gMW3Sn3JFlW3TvX1nREbDLRNQNaeNN8WK0meBwPdAaOI7TtRRRJn/Es1zhrrCHu7g==}
 
-  skillgym@0.6.0:
-    resolution: {integrity: sha512-wIyZgWfZCNt5uo61Awz7c893MNS+KID9iJiNwBKGyI1ddzYuUjKGT4STSejCFcVXZrjhX7tdmhV1U0fQ/3+Ceg==}
+  skillgym@0.8.0:
+    resolution: {integrity: sha512-JR7BI7rYCZ0n9aiEBtaeWawfYHDijlHyV57l+akngatzdl4O//mXTftLxzvMQ3G3O3mG0OA+dIHUn7E2hpPp2A==}
     engines: {node: '>=22.18.0'}
     hasBin: true
 
@@ -4131,7 +4131,7 @@ snapshots:
 
   siginfo@2.0.0: {}
 
-  skillgym@0.6.0:
+  skillgym@0.8.0:
     dependencies:
       cli-spinners: 3.4.0
       nano-spawn: 2.1.0
diff --git a/test/skillgym/README.md b/test/skillgym/README.md
index e6a3ca3af..a1c4a557a 100644
--- a/test/skillgym/README.md
+++ b/test/skillgym/README.md
@@ -11,7 +11,7 @@ This folder is a starter `skillgym` setup for benchmarking the `agent-device` sk
 3. Optional live-device smoke runs: locally, you can extend prompts so the agent actually drives `agent-device` against a simulator or device.
 
 The included suite focuses on the first two layers so it stays stable and CI-safe.
-The suite uses SkillGym v0.6 case tags:
+The suite uses SkillGym v0.8 case tags:
 
 - `fixture-smoke`: fixture-specific app surface coverage
 - `skill-guidance`: command-planning guidance regressions
@@ -50,7 +50,9 @@ Skill-guidance regression cases cover distinct command-planning habits:
 
 The `codex-mini` baseline is a benchmark signal, not a required all-green gate. Its failures should map to command-planning regressions called out by individual case IDs; do not treat the historical pass/fail count as a fixed threshold.
 
-SkillGym v0.6 command assertions are for observed command events. This suite primarily validates the command plan in the final answer, so it converts final-output command lines into a small planned-command report before calling `assert.commands.includes` or `assert.commands.notIncludes`.
+SkillGym v0.8 command assertions are for observed command events. This suite primarily validates the command plan in the final answer, so it converts final-output command lines into a small planned-command report before calling `assert.commands.includes` or `assert.commands.notIncludes`.
+The source-read guardrails use `assert.soft.*` plus deferred explain questions so one failing run can report multiple routing mistakes and can later be inspected with `skillgym explain`.
+Suite types use the v0.8 root export name `Case`; older `TestCase` imports no longer typecheck.
 
 ## Suggested workflow
 
@@ -78,7 +80,7 @@ pnpm exec skillgym run \
   --config ./test/skillgym/skillgym.config.ts
 ```
 
-Useful v0.6 filters and reporters:
+Useful v0.8 filters, reporters, and recovery options:
 
 ```bash
 pnpm build
@@ -91,11 +93,19 @@ pnpm exec skillgym run \
   ./test/skillgym/suites/agent-device-smoke-suite.ts \
   --config ./test/skillgym/skillgym.config.ts \
   --reporter json
+
+pnpm exec skillgym run \
+  ./test/skillgym/suites/agent-device-smoke-suite.ts \
+  --config ./test/skillgym/skillgym.config.ts \
+  --repeat 3 \
+  --repeat-failure 1
 ```
 
 Use `--reporter github-actions` in CI when you want annotations in GitHub Actions logs.
 
-The config uses `schedule: parallel` so the planning suite can run case/runner pairs concurrently up to SkillGym v0.6's default available-machine parallelism cap. This is safe for the included suite because cases validate command plans and local CLI help, not live shared device state or workspace edits. Override with `--max-parallel <n>` for local experiments that need a different cap.
+The config uses `schedule: parallel` so the planning suite can run case/runner pairs concurrently up to SkillGym v0.8's default available-machine parallelism cap. This is safe for the included suite because cases validate command plans and local CLI help, not live shared device state or workspace edits. Override with `--max-parallel <n>` for local experiments that need a different cap.
+Use `--repeat <n>` when you want stability sampling rather than a single pass. Use `--repeat-failure <n>` in local benchmark runs to recover from transient runner failures; keep it off for strict regression checks unless you explicitly want retry artifacts.
+When a run fails on an assertion that records explain questions, run `pnpm exec skillgym explain <artifact-dir>`, where `<artifact-dir>` is the failed `repeat-*` artifact directory, to resume the runner and collect its explanation.
 
 Prerequisites:
 
diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts
index 70e328e28..7502b8be9 100644
--- a/test/skillgym/suites/agent-device-smoke-suite.ts
+++ b/test/skillgym/suites/agent-device-smoke-suite.ts
@@ -1,7 +1,7 @@
-import { assert, commandMatcher, type CommandMatcher, type TestCase } from 'skillgym';
+import { assert, commandMatcher, type Case, type CommandMatcher } from 'skillgym';
 
 type SessionReport = Parameters<typeof assert.skills.has>[0];
-type AssertionContext = Parameters<TestCase['assert']>[1];
+type AssertionContext = Parameters<Case['assert']>[1];
 type OutputMatcher = string | RegExp | PlannedCommandMatcher;
 
 interface PlannedCommandMatcher {
@@ -40,7 +40,7 @@ function assertAgentDeviceEvidence(report: SessionReport) {
   // Some SkillGym runners do not expose skill telemetry. Keep this as a conditional routing
   // assertion instead of failing otherwise valid command-planning runs on missing metadata.
   if (hasDetectedSkills) {
-    assert.ok(
+    assert.soft.ok(
       hasBundledDeviceSkill,
       `Expected detectedSkills to include an agent-device bundled skill. Observed detectedSkills: ${detectedSkills
         .map((skill) => `${skill.skill} (${skill.confidence})`)
@@ -50,9 +50,15 @@ function assertAgentDeviceEvidence(report: SessionReport) {
 }
 
 function assertNoProjectSourceReads(report: SessionReport) {
-  assert.fileReads.notIncludes(report, APP_SOURCE);
-  assert.fileReads.notIncludes(report, REPO_SOURCE);
-  assert.fileReads.notIncludes(report, COMMAND_DOCS);
+  assert.soft.fileReads.notIncludes(report, APP_SOURCE, {
+    explain: { question: 'Why did you read the fixture app source instead of using CLI help?' },
+  });
+  assert.soft.fileReads.notIncludes(report, REPO_SOURCE, {
+    explain: { question: 'Why did you read repo source files instead of using CLI help?' },
+  });
+  assert.soft.fileReads.notIncludes(report, COMMAND_DOCS, {
+    explain: { question: 'Why did you read website command docs instead of local CLI help?' },
+  });
 }
 
 function plannedCommand(command: string): PlannedCommandMatcher {
@@ -194,7 +200,7 @@ function makeCase(options: {
   tags?: string[];
   outputs?: OutputMatcher[];
   forbiddenOutputs?: OutputMatcher[];
-}): TestCase {
+}): Case {
   return {
     id: options.id,
     tags: options.tags,
@@ -202,21 +208,23 @@ function makeCase(options: {
     assert(report, ctx) {
       assertAgentDeviceEvidence(report);
       assertNoProjectSourceReads(report);
-      assert.fileReads.notIncludes(report, SUITE_FILE);
+      assert.soft.fileReads.notIncludes(report, SUITE_FILE, {
+        explain: { question: 'Why did you inspect the benchmark suite while answering?' },
+      });
       assertExpectedOutput(report, ctx, options.outputs);
       assertNoOutputs(ctx.finalOutput(), options.forbiddenOutputs ?? []);
     },
   };
 }
 
-function withTags(tags: string[], cases: TestCase[]): TestCase[] {
+function withTags(tags: string[], cases: Case[]): Case[] {
   return cases.map((testCase) => ({
     ...testCase,
     tags: [...new Set([...(testCase.tags ?? []), ...tags])],
   }));
 }
 
-const FIXTURE_SMOKE_CASES: TestCase[] = [
+const FIXTURE_SMOKE_CASES: Case[] = [
   makeCase({
     id: 'open-and-snapshot',
     contract: [
@@ -516,7 +524,7 @@ const FIXTURE_SMOKE_CASES: TestCase[] = [
   }),
 ];
 
-const SKILL_GUIDANCE_CASES: TestCase[] = [
+const SKILL_GUIDANCE_CASES: Case[] = [
   makeCase({
     id: 'inspect-visible-text-readonly',
     contract: [
@@ -1311,7 +1319,7 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [
   }),
 ];
 
-const suite: TestCase[] = [
+const suite: Case[] = [
   ...withTags(['fixture-smoke'], FIXTURE_SMOKE_CASES),
   ...withTags(['skill-guidance'], SKILL_GUIDANCE_CASES),
 ];