callstackincubator · thymikee · May 11, 2026 · May 11, 2026
diff --git a/package.json b/package.json
@@ -166,7 +166,7 @@
     "fallow": "^2.52.0",
     "oxfmt": "^0.42.0",
     "oxlint": "^1.57.0",
-    "skillgym": "^0.6.0",
+    "skillgym": "^0.8.0",
     "typescript": "^6.0.2",
     "vite": "^8.0.10"
   }

diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
diff --git a/test/skillgym/README.md b/test/skillgym/README.md
@@ -11,7 +11,7 @@ This folder is a starter `skillgym` setup for benchmarking the `agent-device` sk
 3. Optional live-device smoke runs: locally, you can extend prompts so the agent actually drives `agent-device` against a simulator or device.
 
 The included suite focuses on the first two layers so it stays stable and CI-safe.
-The suite uses SkillGym v0.6 case tags:
+The suite uses SkillGym v0.8 case tags:
 
 - `fixture-smoke`: fixture-specific app surface coverage
 - `skill-guidance`: command-planning guidance regressions
@@ -50,7 +50,9 @@ Skill-guidance regression cases cover distinct command-planning habits:
 
 The `codex-mini` baseline is a benchmark signal, not a required all-green gate. Its failures should map to command-planning regressions called out by individual case IDs; do not treat the historical pass/fail count as a fixed threshold.
 
-SkillGym v0.6 command assertions are for observed command events. This suite primarily validates the command plan in the final answer, so it converts final-output command lines into a small planned-command report before calling `assert.commands.includes` or `assert.commands.notIncludes`.
+SkillGym v0.8 command assertions are for observed command events. This suite primarily validates the command plan in the final answer, so it converts final-output command lines into a small planned-command report before calling `assert.commands.includes` or `assert.commands.notIncludes`.
+The source-read guardrails use `assert.soft.*` plus deferred explain questions, so a single failing run can report multiple routing mistakes at once and the failure can be inspected later with `skillgym explain`.
+Suite types use the v0.8 root export name `Case`; older `TestCase` imports no longer typecheck.
 
 ## Suggested workflow
 
@@ -78,7 +80,7 @@ pnpm exec skillgym run \
   --config ./test/skillgym/skillgym.config.ts
 ```
 
-Useful v0.6 filters and reporters:
+Useful v0.8 filters, reporters, and recovery options:
 
 ```bash
 pnpm build
@@ -91,11 +93,19 @@ pnpm exec skillgym run \
   ./test/skillgym/suites/agent-device-smoke-suite.ts \
   --config ./test/skillgym/skillgym.config.ts \
   --reporter json
+
+pnpm exec skillgym run \
+  ./test/skillgym/suites/agent-device-smoke-suite.ts \
+  --config ./test/skillgym/skillgym.config.ts \
+  --repeat 3 \
+  --repeat-failure 1
 ```
 
 Use `--reporter github-actions` in CI when you want annotations in GitHub Actions logs.
 
-The config uses `schedule: parallel` so the planning suite can run case/runner pairs concurrently up to SkillGym v0.6's default available-machine parallelism cap. This is safe for the included suite because cases validate command plans and local CLI help, not live shared device state or workspace edits. Override with `--max-parallel <n>` for local experiments that need a different cap.
+The config uses `schedule: parallel` so the planning suite can run case/runner pairs concurrently, up to SkillGym v0.8's default parallelism cap for the available machine. This is safe for the included suite because cases validate command plans and local CLI help, not live shared device state or workspace edits. Override with `--max-parallel <n>` for local experiments that need a different cap.
+Use `--repeat <n>` when you want stability sampling rather than a single pass. Use `--repeat-failure <n>` for local benchmark recovery from transient runner failures; keep it off for strict regression checks unless you explicitly want retry artifacts.
+When a run fails on an assertion that records explain questions, run `pnpm exec skillgym explain <artifact-dir>` against the failed `repeat-*` artifact directory to resume the runner and collect its explanation.
 
 Prerequisites:
 

diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts
@@ -1,7 +1,7 @@
-import { assert, commandMatcher, type CommandMatcher, type TestCase } from 'skillgym';
+import { assert, commandMatcher, type Case, type CommandMatcher } from 'skillgym';
 
 type SessionReport = Parameters<typeof assert.skills.has>[0];
-type AssertionContext = Parameters<TestCase['assert']>[1];
+type AssertionContext = Parameters<Case['assert']>[1];
 type OutputMatcher = string | RegExp | PlannedCommandMatcher;
 
 interface PlannedCommandMatcher {
@@ -40,7 +40,7 @@ function assertAgentDeviceEvidence(report: SessionReport) {
   // Some SkillGym runners do not expose skill telemetry. Keep this as a conditional routing
   // assertion instead of failing otherwise valid command-planning runs on missing metadata.
   if (hasDetectedSkills) {
-    assert.ok(
+    assert.soft.ok(
       hasBundledDeviceSkill,
       `Expected detectedSkills to include an agent-device bundled skill. Observed detectedSkills: ${detectedSkills
         .map((skill) => `${skill.skill} (${skill.confidence})`)
@@ -50,9 +50,15 @@ function assertAgentDeviceEvidence(report: SessionReport) {
 }
 
 function assertNoProjectSourceReads(report: SessionReport) {
-  assert.fileReads.notIncludes(report, APP_SOURCE);
-  assert.fileReads.notIncludes(report, REPO_SOURCE);
-  assert.fileReads.notIncludes(report, COMMAND_DOCS);
+  assert.soft.fileReads.notIncludes(report, APP_SOURCE, {
+    explain: { question: 'Why did you read the fixture app source instead of using CLI help?' },
+  });
+  assert.soft.fileReads.notIncludes(report, REPO_SOURCE, {
+    explain: { question: 'Why did you read repo source files instead of using CLI help?' },
+  });
+  assert.soft.fileReads.notIncludes(report, COMMAND_DOCS, {
+    explain: { question: 'Why did you read website command docs instead of local CLI help?' },
+  });
 }
 
 function plannedCommand(command: string): PlannedCommandMatcher {
@@ -194,29 +200,31 @@ function makeCase(options: {
   tags?: string[];
   outputs?: OutputMatcher[];
   forbiddenOutputs?: OutputMatcher[];
-}): TestCase {
+}): Case {
   return {
     id: options.id,
     tags: options.tags,
     prompt: buildPrompt({ contract: options.contract, task: options.task }),
     assert(report, ctx) {
       assertAgentDeviceEvidence(report);
       assertNoProjectSourceReads(report);
-      assert.fileReads.notIncludes(report, SUITE_FILE);
+      assert.soft.fileReads.notIncludes(report, SUITE_FILE, {
+        explain: { question: 'Why did you inspect the benchmark suite while answering?' },
+      });
       assertExpectedOutput(report, ctx, options.outputs);
       assertNoOutputs(ctx.finalOutput(), options.forbiddenOutputs ?? []);
     },
   };
 }
 
-function withTags(tags: string[], cases: TestCase[]): TestCase[] {
+function withTags(tags: string[], cases: Case[]): Case[] {
   return cases.map((testCase) => ({
     ...testCase,
     tags: [...new Set([...(testCase.tags ?? []), ...tags])],
   }));
 }
 
-const FIXTURE_SMOKE_CASES: TestCase[] = [
+const FIXTURE_SMOKE_CASES: Case[] = [
   makeCase({
     id: 'open-and-snapshot',
     contract: [
@@ -516,7 +524,7 @@ const FIXTURE_SMOKE_CASES: TestCase[] = [
   }),
 ];
 
-const SKILL_GUIDANCE_CASES: TestCase[] = [
+const SKILL_GUIDANCE_CASES: Case[] = [
   makeCase({
     id: 'inspect-visible-text-readonly',
     contract: [
@@ -1311,7 +1319,7 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [
   }),
 ];
 
-const suite: TestCase[] = [
+const suite: Case[] = [
   ...withTags(['fixture-smoke'], FIXTURE_SMOKE_CASES),
   ...withTags(['skill-guidance'], SKILL_GUIDANCE_CASES),
 ];