Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@
"fallow": "^2.52.0",
"oxfmt": "^0.42.0",
"oxlint": "^1.57.0",
"skillgym": "^0.6.0",
"skillgym": "^0.8.0",
"typescript": "^6.0.2",
"vite": "^8.0.10"
}
Expand Down
10 changes: 5 additions & 5 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 14 additions & 4 deletions test/skillgym/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ This folder is a starter `skillgym` setup for benchmarking the `agent-device` sk
3. Optional live-device smoke runs: locally, you can extend prompts so the agent actually drives `agent-device` against a simulator or device.

The included suite focuses on the first two layers so it stays stable and CI-safe.
The suite uses SkillGym v0.6 case tags:
The suite uses SkillGym v0.8 case tags:

- `fixture-smoke`: fixture-specific app surface coverage
- `skill-guidance`: command-planning guidance regressions
Expand Down Expand Up @@ -50,7 +50,9 @@ Skill-guidance regression cases cover distinct command-planning habits:

The `codex-mini` baseline is a benchmark signal, not a required all-green gate. Its failures should map to command-planning regressions called out by individual case IDs; do not treat the historical pass/fail count as a fixed threshold.

SkillGym v0.6 command assertions are for observed command events. This suite primarily validates the command plan in the final answer, so it converts final-output command lines into a small planned-command report before calling `assert.commands.includes` or `assert.commands.notIncludes`.
SkillGym v0.8 command assertions are for observed command events. This suite primarily validates the command plan in the final answer, so it converts final-output command lines into a small planned-command report before calling `assert.commands.includes` or `assert.commands.notIncludes`.
The source-read guardrails use `assert.soft.*` plus deferred explain questions, so a single failing run can report multiple routing mistakes and can be inspected afterwards with `skillgym explain`.
Suite types use the v0.8 root export name `Case`; older `TestCase` imports no longer typecheck.

## Suggested workflow

Expand Down Expand Up @@ -78,7 +80,7 @@ pnpm exec skillgym run \
--config ./test/skillgym/skillgym.config.ts
```

Useful v0.6 filters and reporters:
Useful v0.8 filters, reporters, and recovery options:

```bash
pnpm build
Expand All @@ -91,11 +93,19 @@ pnpm exec skillgym run \
./test/skillgym/suites/agent-device-smoke-suite.ts \
--config ./test/skillgym/skillgym.config.ts \
--reporter json

pnpm exec skillgym run \
./test/skillgym/suites/agent-device-smoke-suite.ts \
--config ./test/skillgym/skillgym.config.ts \
--repeat 3 \
--repeat-failure 1
```

Use `--reporter github-actions` in CI when you want annotations in GitHub Actions logs.

The config uses `schedule: parallel` so the planning suite can run case/runner pairs concurrently up to SkillGym v0.6's default available-machine parallelism cap. This is safe for the included suite because cases validate command plans and local CLI help, not live shared device state or workspace edits. Override with `--max-parallel <n>` for local experiments that need a different cap.
The config uses `schedule: parallel` so the planning suite can run case/runner pairs concurrently, up to SkillGym v0.8's default parallelism cap for the available machine. This is safe for the included suite because its cases validate command plans and local CLI help, not live shared device state or workspace edits. Override the cap with `--max-parallel <n>` for local experiments that need a different limit.
Use `--repeat <n>` when you want stability sampling rather than a single pass. Use `--repeat-failure <n>` for local benchmark recovery from transient runner failures; keep it off for strict regression checks unless you explicitly want retry artifacts.
When a run fails on an assertion that records explain questions, run `pnpm exec skillgym explain <artifact-dir>` against the failed `repeat-*` artifact directory to resume the runner and collect its explanation.

Prerequisites:

Expand Down
32 changes: 20 additions & 12 deletions test/skillgym/suites/agent-device-smoke-suite.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { assert, commandMatcher, type CommandMatcher, type TestCase } from 'skillgym';
import { assert, commandMatcher, type Case, type CommandMatcher } from 'skillgym';

type SessionReport = Parameters<typeof assert.skills.has>[0];
type AssertionContext = Parameters<TestCase['assert']>[1];
type AssertionContext = Parameters<Case['assert']>[1];
type OutputMatcher = string | RegExp | PlannedCommandMatcher;

interface PlannedCommandMatcher {
Expand Down Expand Up @@ -40,7 +40,7 @@ function assertAgentDeviceEvidence(report: SessionReport) {
// Some SkillGym runners do not expose skill telemetry. Keep this as a conditional routing
// assertion instead of failing otherwise valid command-planning runs on missing metadata.
if (hasDetectedSkills) {
assert.ok(
assert.soft.ok(
hasBundledDeviceSkill,
`Expected detectedSkills to include an agent-device bundled skill. Observed detectedSkills: ${detectedSkills
.map((skill) => `${skill.skill} (${skill.confidence})`)
Expand All @@ -50,9 +50,15 @@ function assertAgentDeviceEvidence(report: SessionReport) {
}

function assertNoProjectSourceReads(report: SessionReport) {
assert.fileReads.notIncludes(report, APP_SOURCE);
assert.fileReads.notIncludes(report, REPO_SOURCE);
assert.fileReads.notIncludes(report, COMMAND_DOCS);
assert.soft.fileReads.notIncludes(report, APP_SOURCE, {
explain: { question: 'Why did you read the fixture app source instead of using CLI help?' },
});
assert.soft.fileReads.notIncludes(report, REPO_SOURCE, {
explain: { question: 'Why did you read repo source files instead of using CLI help?' },
});
assert.soft.fileReads.notIncludes(report, COMMAND_DOCS, {
explain: { question: 'Why did you read website command docs instead of local CLI help?' },
});
}

function plannedCommand(command: string): PlannedCommandMatcher {
Expand Down Expand Up @@ -194,29 +200,31 @@ function makeCase(options: {
tags?: string[];
outputs?: OutputMatcher[];
forbiddenOutputs?: OutputMatcher[];
}): TestCase {
}): Case {
return {
id: options.id,
tags: options.tags,
prompt: buildPrompt({ contract: options.contract, task: options.task }),
assert(report, ctx) {
assertAgentDeviceEvidence(report);
assertNoProjectSourceReads(report);
assert.fileReads.notIncludes(report, SUITE_FILE);
assert.soft.fileReads.notIncludes(report, SUITE_FILE, {
explain: { question: 'Why did you inspect the benchmark suite while answering?' },
});
assertExpectedOutput(report, ctx, options.outputs);
assertNoOutputs(ctx.finalOutput(), options.forbiddenOutputs ?? []);
},
};
}

function withTags(tags: string[], cases: TestCase[]): TestCase[] {
function withTags(tags: string[], cases: Case[]): Case[] {
return cases.map((testCase) => ({
...testCase,
tags: [...new Set([...(testCase.tags ?? []), ...tags])],
}));
}

const FIXTURE_SMOKE_CASES: TestCase[] = [
const FIXTURE_SMOKE_CASES: Case[] = [
makeCase({
id: 'open-and-snapshot',
contract: [
Expand Down Expand Up @@ -516,7 +524,7 @@ const FIXTURE_SMOKE_CASES: TestCase[] = [
}),
];

const SKILL_GUIDANCE_CASES: TestCase[] = [
const SKILL_GUIDANCE_CASES: Case[] = [
makeCase({
id: 'inspect-visible-text-readonly',
contract: [
Expand Down Expand Up @@ -1311,7 +1319,7 @@ const SKILL_GUIDANCE_CASES: TestCase[] = [
}),
];

const suite: TestCase[] = [
const suite: Case[] = [
...withTags(['fixture-smoke'], FIXTURE_SMOKE_CASES),
...withTags(['skill-guidance'], SKILL_GUIDANCE_CASES),
];
Expand Down
Loading