Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -147,3 +147,63 @@ jobs:
name: playwright-report
path: apps/code/playwright-report/
retention-days: 7

e2e:
# Live-model e2e for the @posthog/agent adapters (claude + codex). Runs only
# after the unit + integration jobs pass — a red tree never reaches the
# gateway. Opt-in and safe by default: without vars.AGENT_E2E_ENABLED it is
# skipped, and even when enabled it self-skips every arm unless the
# E2E_GATEWAY_TOKEN secret is present (fork PRs never see it) and
# E2E_GATEWAY_URL points at a runner-reachable gateway. Drives cheap models
# (claude-haiku-4-5 / gpt-5-mini), so an enabled run is a handful of short turns.
needs: [unit-test, integration-test]
# Enabled at the org level, and skipped on fork PRs — secrets (the gateway
# token) are withheld from forks, so the fail-loud token guard would red them
# spuriously. Same-repo PRs get the secret and enforce the guard.
if: ${{ vars.AGENT_E2E_ENABLED == 'true' && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) }}
runs-on: ubuntu-latest
timeout-minutes: 30
permissions:
contents: read
steps:
- name: Checkout
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
persist-credentials: false

- name: Setup pnpm
uses: pnpm/action-setup@b906affcce14559ad1aafd4ab0e942779e9f58b1 # v4.3.0

- name: Setup Node.js
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
with:
node-version: 22
cache: "pnpm"

- name: Install dependencies
run: pnpm install --frozen-lockfile

- name: Build agent dependencies
run: |
pnpm --filter @posthog/shared run build
pnpm --filter @posthog/git run build
pnpm --filter @posthog/enricher run build

- name: Download native codex binary
# Non-fatal at the STEP so a failure surfaces as the fail-loud binary guard
# (guard.e2e.test.ts) with a clear message rather than an opaque download
# error. A missing binary then REDS the run (the guard fails when a token is
# set) instead of letting the codex arm silently skip to green.
run: node apps/code/scripts/download-binaries.mjs || echo "codex binary download failed; the binary guard test will red the run"

- name: Run live e2e (both adapters)
run: pnpm --filter agent run test:e2e
env:
E2E_GATEWAY_TOKEN: ${{ secrets.E2E_GATEWAY_TOKEN }}
E2E_GATEWAY_URL: ${{ vars.E2E_GATEWAY_URL }}
E2E_CLAUDE_MODEL: ${{ vars.E2E_CLAUDE_MODEL }}
E2E_CODEX_MODEL: ${{ vars.E2E_CODEX_MODEL }}
# Optional: set vars.E2E_ENVIRONMENT=cloud to exercise the cloud code
# path (sandbox/permission-profile gating). Unset = local. The OS-sandbox
# enforcement test is macOS-gated, so it doesn't red this linux runner.
E2E_ENVIRONMENT: ${{ vars.E2E_ENVIRONMENT }}
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ bin/

# tsup bundled config artifacts (temporary files left behind when bundling TS configs)
*.config.bundled_*.mjs
# vite bundled config artifacts (left behind when a vitest run is interrupted)
*.config.ts.timestamp-*.mjs

# Environment
.env
Expand Down
278 changes: 278 additions & 0 deletions CODEX_APP_SERVER_TESTING.md

Large diffs are not rendered by default.

94 changes: 94 additions & 0 deletions packages/agent/e2e/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Live agent e2e suite

Drives representative sessions **end to end** through the real adapter, the real
binary (codex `app-server` / Claude Code CLI), and the real llm-gateway on a cheap
model — parametrized across `claude` and `codex`. The only thing mocked is the
host/UI client (a recording `sessionUpdate`, an auto-allow `requestPermission`,
and real file read/write against a throwaway git repo). Nothing in the
agent/model/tool path is stubbed.

## What it covers

Two suites, each a per-adapter loop with `describe.skipIf` over `["claude",
"codex"]` (titles carry a `(claude)` / `(codex)` marker so `-t "(codex)"` selects
one arm across both files):

`session-lifecycle.e2e.test.ts` — one shared golden turn plus focused scenarios:
- **newSession config options** — model / effort selectors are offered.
- **working turn** — `initialize → newSession → prompt` (read a file, edit a
line, run a command): streamed assistant text, tool calls + a completed tool
call, the exact usage signal, `stopReason: end_turn`, the real on-disk file
edit, and (codex) the `_posthog/sdk_session` + `_posthog/turn_complete`
ext-notifications.
- **setSessionConfigOption** — switching a config option is accepted + acked.
- **interrupt** — `cancel` during an in-flight (unbounded) turn yields `cancelled`.
- **resumeSession** — reconnect returns config options.
- **loadSession** — a fresh connection reattaches and the transcript replays
(asserts the tool transcript replays, not just any update).

Codex-only (advertised codex capabilities; registered as skipped on the claude
arm so the gap is visible):
- **mode switch** → `current_mode_update`.
- **steering** — a mid-turn prompt folds into the running turn via `turn/steer`.
- **list + fork** — `listSessions` finds the session; `forkSession` branches it.

The command/file approval `{decision}` round-trip is **not** covered here: codex
spawns under a `danger-full-access` sandbox and auto-approves, so it never sends
an approval request to assert on. That envelope is covered by unit tests instead.

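`structured-output.e2e.test.ts` — `_meta.jsonSchema` + `onStructuredOutput`
delivers a parsed, schema-constrained object (the signals-pipeline contract).

A separate codex-only file, `compaction.e2e.test.ts`, spawns codex with a low
auto-compact token limit and a large filler input, then asserts that the adapter
surfaces `_posthog/compact_boundary` to the host.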

Assertions are structural lifecycle invariants + the deterministic file/JSON
side effects — never model prose — so they hold across adapters and cheap models.

## Structure

- `config.ts` — gateway/token/model resolution, per-adapter env wiring, skip logic.
- `driver.ts` — the in-process ACP host client (recording capture, auto-allow,
real FS), `openConnection` / `openSession` helpers, the throwaway-repo helpers,
and `waitFor`.
- `*.e2e.test.ts` — the scenarios.

## Running

These never run under `pnpm test` or in the regular unit/integration CI jobs (the
default vitest config only includes `src/**`). They are opt-in and cost a couple
of short model turns.

In CI they run as the **`e2e` job in `.github/workflows/test.yml`**, after the
unit + integration jobs pass. The job is opt-in and safe by default: it is
skipped unless the repo variable `AGENT_E2E_ENABLED` is `true`, and it needs an
`E2E_GATEWAY_TOKEN` secret and an `E2E_GATEWAY_URL` variable pointing at a
gateway reachable from the runner. It runs for same-repo pull requests (and for
any non-pull-request event that triggers the workflow) but never for fork PRs:
their secrets are withheld, which would otherwise red the fail-loud token guard.
Off by default, so it costs nothing until explicitly enabled. If the native codex
binary isn't on the runner, the codex arm self-skips — and in CI the binary guard
test (`guard.e2e.test.ts`) then fails the run when a token is set, so a missing
binary cannot pass as green.

```bash
# from packages/agent — reads the local dev API key from the posthog repo, runs both arms
bash e2e/run-e2e.sh

# just one adapter (matches the (codex) / (claude) marker in every title)
bash e2e/run-e2e.sh -t "(codex)"
```

Prereqs: a local llm-gateway up (`./bin/start` in the posthog repo) and the
native codex binary present at `apps/code/resources/codex-acp/codex` (the codex
arm self-skips if it is missing).

## Configuration (env)

| Var | Default | Notes |
| --- | --- | --- |
| `E2E_GATEWAY_TOKEN` | — | Required. A token the gateway accepts — the `llm_gateway` product takes a personal API key (no OAuth). Without it every arm skips. `run-e2e.sh` reads the local dev key. |
| `E2E_GATEWAY_URL` | `http://localhost:3308/llm_gateway` | Gateway base (codex appends `/v1`). `llm_gateway` accepts a personal API key; `posthog_code` is OAuth-only. |
| `E2E_CLAUDE_MODEL` | `claude-haiku-4-5` | Override if the gateway serves a different cheap Claude id. |
| `E2E_CODEX_MODEL` | `gpt-5-mini` | Cheapest codex id the local gateway serves; override if needed. |
| `POSTHOG_REPO` | sibling `../posthog` | Where `run-e2e.sh` reads the local dev key from. |
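| `E2E_DEBUG` | — | `1` for verbose adapter logging. |
| `E2E_ENVIRONMENT` | — (local) | Set to `cloud` to exercise the cloud code path; unset or empty means local. |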

If a default model isn't served by your gateway, the turn fails loudly (never a
false green) — set the matching `E2E_*_MODEL`.

Each arm self-skips with a visible reason (missing token / missing binary) rather
than passing silently.
101 changes: 101 additions & 0 deletions packages/agent/e2e/compaction.e2e.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
import { afterAll, beforeAll, describe, expect, it } from "vitest";
import { type Adapter, E2E } from "./config";
import {
cleanupRepo,
killCodexStragglers,
openSession,
setupRepo,
} from "./driver";

/**
 * Codex-only live compaction e2e.
 *
 * codex auto-compacts once the context crosses `model_auto_compact_token_limit`.
 * The session is spawned with a deliberately low limit and fed one large, cheap
 * input blob so that a later turn trips it; the adapter must then surface
 * `_posthog/compact_boundary` to the host.
 *
 * Claude is left out on purpose: its manual `/compact` hangs `prompt()`, and
 * forcing auto compaction is too costly.
 *
 * Tuning: if compaction never fires, raise the limit and FILLER together.
 */
const ADAPTERS: Adapter[] = ["codex"];

// Chosen to sit above codex's resident baseline. FILLER alone exceeds it, so the
// crossing does not depend on how large that baseline happens to be.
const AUTO_COMPACT_TOKEN_LIMIT = 16_000;
// 1800 copies of a 45-char sentence (≈ 11 tokens each) ≈ 20k tokens — larger
// than AUTO_COMPACT_TOKEN_LIMIT.
const FILLER = Array.from(
  { length: 1_800 },
  () => "The quick brown fox jumps over the lazy dog. ",
).join("");
// Upper bound on prompts sent while waiting for the compaction boundary.
const MAX_CODEX_TURNS = 3;

// Registers one compaction suite per adapter in ADAPTERS. An arm that cannot run
// is still registered (as skipped) and carries its skip reason in the title, so
// the gap is visible in the report instead of silently absent.
for (const adapter of ADAPTERS) {
  const skip = E2E.skipReason(adapter);
  const title = `compaction (${adapter})${skip ? ` — SKIPPED (${skip})` : ""}`;

  describe.skipIf(!!skip)(title, () => {
    // Throwaway repo used as the session cwd; created in beforeAll.
    let repo: string;

    beforeAll(() => {
      // NOTE(review): presumably clears codex processes left over from earlier
      // runs — confirm against the driver.
      if (adapter === "codex") killCodexStragglers();
      E2E.configureEnv(adapter);
      repo = setupRepo();
    });

    afterAll(() => {
      cleanupRepo(repo);
    });

    it("surfaces a compaction to the host via compact_boundary", async () => {
      const s = await openSession({
        adapter,
        cwd: repo,
        codexOptions:
          adapter === "codex"
            ? E2E.codexOptions(repo, {
                // The model-scoped key is the effective one; set both to be safe.
                model_auto_compact_token_limit: AUTO_COMPACT_TOKEN_LIMIT,
                auto_compact_token_limit: AUTO_COMPACT_TOKEN_LIMIT,
              })
            : undefined,
        meta: {
          systemPrompt: "You are a coding assistant in a tiny test repo.",
          model: E2E.model(adapter),
          permissionMode: "bypassPermissions",
          taskRunId: "e2e-compaction",
        },
      });
      try {
        // True once the host has recorded the compaction ext-notification.
        const compacted = () =>
          s.capture.extMethods().includes("_posthog/compact_boundary");

        // Turn 1's big input blob fills the context past the limit; turn 2+ trips
        // auto-compaction. Stop as soon as the boundary is surfaced.
        //
        // There is deliberately no manual `/compact` path here: ADAPTERS only
        // contains "codex", and the manual trigger is the flow that hangs
        // `prompt()` on claude, so a claude branch would be unreachable.
        for (let i = 0; i < MAX_CODEX_TURNS && !compacted(); i++) {
          const text =
            i === 0
              ? `Reference text — do not summarize, reply with only: OK.\n\n${FILLER}`
              : "Reply with only: DONE.";
          await s.conn.prompt({
            sessionId: s.sessionId,
            prompt: [{ type: "text", text }],
          });
        }

        expect(
          compacted(),
          `expected a _posthog/compact_boundary; saw methods: ${s.capture
            .extMethods()
            .join(", ")}`,
        ).toBe(true);
      } finally {
        // Always tear the session down, even when the assertion fails.
        await s.cleanup();
      }
    }, 300_000);
  });
}
106 changes: 106 additions & 0 deletions packages/agent/e2e/config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import { existsSync } from "node:fs";
import { join } from "node:path";

export type Adapter = "claude" | "codex";

/**
 * Live e2e configuration. Everything is read from the environment, so no secret
 * is ever committed. It expects a local llm-gateway plus a token supplied through
 * `E2E_GATEWAY_TOKEN`, and targets the `llm_gateway` product, which accepts a
 * personal API key (no OAuth mint, unlike prod's `posthog_code`). When the token
 * is absent every arm self-skips.
 */
const DEFAULT_GATEWAY_URL = "http://localhost:3308/llm_gateway";
// `||` rather than `??`: CI passes unset variables through as "", and an empty
// value must fall back to the default just like a missing one.
const GATEWAY_URL = process.env.E2E_GATEWAY_URL || DEFAULT_GATEWAY_URL;
const TOKEN = process.env.E2E_GATEWAY_TOKEN || "";

// Path to the native app-server binary: three levels up from this directory
// (packages/agent/e2e), then into apps/code/resources/codex-acp.
const NATIVE_CODEX_BIN = join(
  __dirname,
  "../../..",
  "apps/code/resources/codex-acp/codex",
);

/**
 * The gateway base with a trailing `/v1` (codex / OpenAI-format endpoint).
 *
 * Trailing slashes are stripped before the suffix check, so a base such as
 * `http://host/llm_gateway/` becomes `http://host/llm_gateway/v1` rather than
 * `http://host/llm_gateway//v1`, and `http://host/llm_gateway/v1/` is recognised
 * as already carrying the suffix.
 *
 * @param base Gateway base URL; defaults to the configured `GATEWAY_URL`.
 * @returns The base ending in exactly one `/v1`, without a trailing slash.
 */
function openAiBase(base: string = GATEWAY_URL): string {
  const trimmed = base.replace(/\/+$/, "");
  return trimmed.endsWith("/v1") ? trimmed : `${trimmed}/v1`;
}

/** Shared configuration surface for the live e2e suites. */
export const E2E = {
  token: TOKEN,
  hasToken: Boolean(TOKEN),
  gatewayUrl: GATEWAY_URL,
  codexBin: NATIVE_CODEX_BIN,
  /**
   * Deployment environment. `E2E_ENVIRONMENT=cloud` exercises the cloud code
   * path; an unset or empty variable yields undefined, which means local.
   */
  environment:
    (process.env.E2E_ENVIRONMENT as "local" | "cloud" | undefined) || undefined,

  /**
   * The cheap model id for an adapter, overridable via `E2E_CLAUDE_MODEL` /
   * `E2E_CODEX_MODEL`.
   */
  model(adapter: Adapter): string {
    const isClaude = adapter === "claude";
    const override = isClaude
      ? process.env.E2E_CLAUDE_MODEL
      : process.env.E2E_CODEX_MODEL;
    // gpt-5-mini is on the product block list, but that gate is only enforced in
    // Agent.run — the e2e drives createAcpConnection directly, so it's accepted.
    const fallback = isClaude ? "claude-haiku-4-5" : "gpt-5-mini";
    // `||` so an empty CI variable falls back to the default.
    return override || fallback;
  },

  /**
   * Why an arm cannot run, or null when it can. A string is the reason to skip
   * that arm with, so a skip is never silent.
   */
  skipReason(adapter: Adapter): string | null {
    if (!TOKEN) return "E2E_GATEWAY_TOKEN not set";
    // Only the codex arm depends on the native binary being on disk.
    const codexBinaryMissing =
      adapter === "codex" && !existsSync(NATIVE_CODEX_BIN);
    return codexBinaryMissing
      ? `native codex binary missing at ${NATIVE_CODEX_BIN}`
      : null;
  },

  /** Point the adapter at the gateway as the host's `configureEnvironment` does. */
  configureEnv(adapter: Adapter): void {
    if (adapter !== "claude") {
      process.env.OPENAI_BASE_URL = openAiBase();
      process.env.OPENAI_API_KEY = TOKEN;
      process.env.POSTHOG_CODEX_USE_APP_SERVER = "1";
      return;
    }
    process.env.ANTHROPIC_BASE_URL = GATEWAY_URL;
    process.env.ANTHROPIC_AUTH_TOKEN = TOKEN;
  },

  /**
   * The codexOptions the codex arm passes through `createAcpConnection`.
   * `configOverrides` is only present on the result when one was supplied, and
   * `modelOverride` wins over the default codex model when non-empty.
   */
  codexOptions(
    cwd: string,
    configOverrides?: Record<string, string | number>,
    modelOverride?: string,
  ): {
    cwd: string;
    binaryPath: string;
    apiBaseUrl: string;
    apiKey: string;
    model: string;
    configOverrides?: Record<string, string | number>;
  } {
    const base = {
      cwd,
      binaryPath: NATIVE_CODEX_BIN,
      apiBaseUrl: openAiBase(),
      apiKey: TOKEN,
      model: modelOverride || this.model("codex"),
    };
    return configOverrides ? { ...base, configOverrides } : base;
  },

  /**
   * A stronger model for tests the cheapest models can't handle (e.g.
   * structured-output decodes).
   *
   * NOTE(review): this reads the same `E2E_CLAUDE_MODEL` / `E2E_CODEX_MODEL`
   * overrides as `model()`, so when those variables are set both methods return
   * the same id — confirm that is intended.
   */
  strongModel(adapter: Adapter): string {
    return adapter === "claude"
      ? process.env.E2E_CLAUDE_MODEL || "claude-sonnet-4-5"
      : process.env.E2E_CODEX_MODEL || "gpt-5.5";
  },
};
Loading
Loading