From 64f575a5a24917b0d1184b200e36d85e97317c95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Pierzcha=C5=82a?= Date: Mon, 11 May 2026 11:29:47 +0200 Subject: [PATCH] chore: upgrade skillgym to 0.8.0 --- package.json | 2 +- pnpm-lock.yaml | 10 +++--- test/skillgym/README.md | 18 ++++++++--- .../suites/agent-device-smoke-suite.ts | 32 ++++++++++++------- 4 files changed, 40 insertions(+), 22 deletions(-) diff --git a/package.json b/package.json index 1ab9730ac..f7a3d101b 100644 --- a/package.json +++ b/package.json @@ -166,7 +166,7 @@ "fallow": "^2.52.0", "oxfmt": "^0.42.0", "oxlint": "^1.57.0", - "skillgym": "^0.6.0", + "skillgym": "^0.8.0", "typescript": "^6.0.2", "vite": "^8.0.10" } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ab91178ba..76bb8ce55 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -37,8 +37,8 @@ importers: specifier: ^1.57.0 version: 1.57.0 skillgym: - specifier: ^0.6.0 - version: 0.6.0 + specifier: ^0.8.0 + version: 0.8.0 typescript: specifier: ^6.0.2 version: 6.0.2 @@ -1824,8 +1824,8 @@ packages: siginfo@2.0.0: resolution: {integrity: sha512-ybx0WO1/8bSBLEWXZvEd7gMW3Sn3JFlW3TvX1nREbDLRNQNaeNN8WK0meBwPdAaOI7TtRRRJn/Es1zhrrCHu7g==} - skillgym@0.6.0: - resolution: {integrity: sha512-wIyZgWfZCNt5uo61Awz7c893MNS+KID9iJiNwBKGyI1ddzYuUjKGT4STSejCFcVXZrjhX7tdmhV1U0fQ/3+Ceg==} + skillgym@0.8.0: + resolution: {integrity: sha512-JR7BI7rYCZ0n9aiEBtaeWawfYHDijlHyV57l+akngatzdl4O//mXTftLxzvMQ3G3O3mG0OA+dIHUn7E2hpPp2A==} engines: {node: '>=22.18.0'} hasBin: true @@ -4131,7 +4131,7 @@ snapshots: siginfo@2.0.0: {} - skillgym@0.6.0: + skillgym@0.8.0: dependencies: cli-spinners: 3.4.0 nano-spawn: 2.1.0 diff --git a/test/skillgym/README.md b/test/skillgym/README.md index e6a3ca3af..a1c4a557a 100644 --- a/test/skillgym/README.md +++ b/test/skillgym/README.md @@ -11,7 +11,7 @@ This folder is a starter `skillgym` setup for benchmarking the `agent-device` sk 3. 
Optional live-device smoke runs: locally, you can extend prompts so the agent actually drives `agent-device` against a simulator or device. The included suite focuses on the first two layers so it stays stable and CI-safe. -The suite uses SkillGym v0.6 case tags: +The suite uses SkillGym v0.8 case tags: - `fixture-smoke`: fixture-specific app surface coverage - `skill-guidance`: command-planning guidance regressions @@ -50,7 +50,9 @@ Skill-guidance regression cases cover distinct command-planning habits: The `codex-mini` baseline is a benchmark signal, not a required all-green gate. Its failures should map to command-planning regressions called out by individual case IDs; do not treat the historical pass/fail count as a fixed threshold. -SkillGym v0.6 command assertions are for observed command events. This suite primarily validates the command plan in the final answer, so it converts final-output command lines into a small planned-command report before calling `assert.commands.includes` or `assert.commands.notIncludes`. +SkillGym v0.8 command assertions are for observed command events. This suite primarily validates the command plan in the final answer, so it converts final-output command lines into a small planned-command report before calling `assert.commands.includes` or `assert.commands.notIncludes`. +The source-read guardrails use `assert.soft.*` plus deferred explain questions so one failing run can report multiple routing mistakes and can later be inspected with `skillgym explain`. +Suite types use the v0.8 root export name `Case`; older `TestCase` imports no longer typecheck. 
## Suggested workflow @@ -78,7 +80,7 @@ pnpm exec skillgym run \ --config ./test/skillgym/skillgym.config.ts ``` -Useful v0.6 filters and reporters: +Useful v0.8 filters, reporters, and recovery options: ```bash pnpm build @@ -91,11 +93,19 @@ pnpm exec skillgym run \ ./test/skillgym/suites/agent-device-smoke-suite.ts \ --config ./test/skillgym/skillgym.config.ts \ --reporter json + +pnpm exec skillgym run \ + ./test/skillgym/suites/agent-device-smoke-suite.ts \ + --config ./test/skillgym/skillgym.config.ts \ + --repeat 3 \ + --repeat-failure 1 ``` Use `--reporter github-actions` in CI when you want annotations in GitHub Actions logs. -The config uses `schedule: parallel` so the planning suite can run case/runner pairs concurrently up to SkillGym v0.6's default available-machine parallelism cap. This is safe for the included suite because cases validate command plans and local CLI help, not live shared device state or workspace edits. Override with `--max-parallel <n>` for local experiments that need a different cap. +The config uses `schedule: parallel` so the planning suite can run case/runner pairs concurrently up to SkillGym v0.8's default available-machine parallelism cap. This is safe for the included suite because cases validate command plans and local CLI help, not live shared device state or workspace edits. Override with `--max-parallel <n>` for local experiments that need a different cap. +Use `--repeat <n>` when you want stability sampling rather than a single pass. Use `--repeat-failure <n>` for local benchmark recovery from transient runner failures; keep it off for strict regression checks unless you explicitly want retry artifacts. +When a run fails on an assertion that records explain questions, run `pnpm exec skillgym explain <artifact-dir>` against the failed `repeat-*` artifact directory to resume the runner and collect its explanation. 
Prerequisites: diff --git a/test/skillgym/suites/agent-device-smoke-suite.ts b/test/skillgym/suites/agent-device-smoke-suite.ts index 70e328e28..7502b8be9 100644 --- a/test/skillgym/suites/agent-device-smoke-suite.ts +++ b/test/skillgym/suites/agent-device-smoke-suite.ts @@ -1,7 +1,7 @@ -import { assert, commandMatcher, type CommandMatcher, type TestCase } from 'skillgym'; +import { assert, commandMatcher, type Case, type CommandMatcher } from 'skillgym'; type SessionReport = Parameters[0]; -type AssertionContext = Parameters[1]; +type AssertionContext = Parameters[1]; type OutputMatcher = string | RegExp | PlannedCommandMatcher; interface PlannedCommandMatcher { @@ -40,7 +40,7 @@ function assertAgentDeviceEvidence(report: SessionReport) { // Some SkillGym runners do not expose skill telemetry. Keep this as a conditional routing // assertion instead of failing otherwise valid command-planning runs on missing metadata. if (hasDetectedSkills) { - assert.ok( + assert.soft.ok( hasBundledDeviceSkill, `Expected detectedSkills to include an agent-device bundled skill. Observed detectedSkills: ${detectedSkills .map((skill) => `${skill.skill} (${skill.confidence})`) @@ -50,9 +50,15 @@ function assertAgentDeviceEvidence(report: SessionReport) { } function assertNoProjectSourceReads(report: SessionReport) { - assert.fileReads.notIncludes(report, APP_SOURCE); - assert.fileReads.notIncludes(report, REPO_SOURCE); - assert.fileReads.notIncludes(report, COMMAND_DOCS); + assert.soft.fileReads.notIncludes(report, APP_SOURCE, { + explain: { question: 'Why did you read the fixture app source instead of using CLI help?' }, + }); + assert.soft.fileReads.notIncludes(report, REPO_SOURCE, { + explain: { question: 'Why did you read repo source files instead of using CLI help?' }, + }); + assert.soft.fileReads.notIncludes(report, COMMAND_DOCS, { + explain: { question: 'Why did you read website command docs instead of local CLI help?' 
}, + }); } function plannedCommand(command: string): PlannedCommandMatcher { @@ -194,7 +200,7 @@ function makeCase(options: { tags?: string[]; outputs?: OutputMatcher[]; forbiddenOutputs?: OutputMatcher[]; -}): TestCase { +}): Case { return { id: options.id, tags: options.tags, @@ -202,21 +208,23 @@ function makeCase(options: { assert(report, ctx) { assertAgentDeviceEvidence(report); assertNoProjectSourceReads(report); - assert.fileReads.notIncludes(report, SUITE_FILE); + assert.soft.fileReads.notIncludes(report, SUITE_FILE, { + explain: { question: 'Why did you inspect the benchmark suite while answering?' }, + }); assertExpectedOutput(report, ctx, options.outputs); assertNoOutputs(ctx.finalOutput(), options.forbiddenOutputs ?? []); }, }; } -function withTags(tags: string[], cases: TestCase[]): TestCase[] { +function withTags(tags: string[], cases: Case[]): Case[] { return cases.map((testCase) => ({ ...testCase, tags: [...new Set([...(testCase.tags ?? []), ...tags])], })); } -const FIXTURE_SMOKE_CASES: TestCase[] = [ +const FIXTURE_SMOKE_CASES: Case[] = [ makeCase({ id: 'open-and-snapshot', contract: [ @@ -516,7 +524,7 @@ const FIXTURE_SMOKE_CASES: TestCase[] = [ }), ]; -const SKILL_GUIDANCE_CASES: TestCase[] = [ +const SKILL_GUIDANCE_CASES: Case[] = [ makeCase({ id: 'inspect-visible-text-readonly', contract: [ @@ -1311,7 +1319,7 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [ }), ]; -const suite: TestCase[] = [ +const suite: Case[] = [ ...withTags(['fixture-smoke'], FIXTURE_SMOKE_CASES), ...withTags(['skill-guidance'], SKILL_GUIDANCE_CASES), ];