diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5d943aee3b..3de066b56c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -147,3 +147,63 @@ jobs: name: playwright-report path: apps/code/playwright-report/ retention-days: 7 + + e2e: + # Live-model e2e for the @posthog/agent adapters (claude + codex). Runs only + # after the unit + integration jobs pass — a red tree never reaches the + # gateway. Opt-in and safe by default: without vars.AGENT_E2E_ENABLED it is + # skipped, and even when enabled it self-skips every arm unless the + # E2E_GATEWAY_TOKEN secret is present (fork PRs never see it) and + # E2E_GATEWAY_URL points at a runner-reachable gateway. Drives cheap models + # (claude-haiku-4-5 / gpt-5-mini), so an enabled run is a handful of short turns. + needs: [unit-test, integration-test] + # Enabled at the org level, and skipped on fork PRs — secrets (the gateway + # token) are withheld from forks, so the fail-loud token guard would red them + # spuriously. Same-repo PRs get the secret and enforce the guard. 
+ if: ${{ vars.AGENT_E2E_ENABLED == 'true' && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) }} + runs-on: ubuntu-latest + timeout-minutes: 30 + permissions: + contents: read + steps: + - name: Checkout + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + with: + persist-credentials: false + + - name: Setup pnpm + uses: pnpm/action-setup@b906affcce14559ad1aafd4ab0e942779e9f58b1 # v4.3.0 + + - name: Setup Node.js + uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0 + with: + node-version: 22 + cache: "pnpm" + + - name: Install dependencies + run: pnpm install --frozen-lockfile + + - name: Build agent dependencies + run: | + pnpm --filter @posthog/shared run build + pnpm --filter @posthog/git run build + pnpm --filter @posthog/enricher run build + + - name: Download native codex binary + # Non-fatal at the STEP so a failure surfaces as the fail-loud binary guard + # (guard.e2e.test.ts) with a clear message rather than an opaque download + # error. A missing binary then REDS the run (the guard fails when a token is + # set) instead of letting the codex arm silently skip to green. + run: node apps/code/scripts/download-binaries.mjs || echo "codex binary download failed; the binary guard test will red the run" + + - name: Run live e2e (both adapters) + run: pnpm --filter agent run test:e2e + env: + E2E_GATEWAY_TOKEN: ${{ secrets.E2E_GATEWAY_TOKEN }} + E2E_GATEWAY_URL: ${{ vars.E2E_GATEWAY_URL }} + E2E_CLAUDE_MODEL: ${{ vars.E2E_CLAUDE_MODEL }} + E2E_CODEX_MODEL: ${{ vars.E2E_CODEX_MODEL }} + # Optional: set vars.E2E_ENVIRONMENT=cloud to exercise the cloud code + # path (sandbox/permission-profile gating). Unset = local. The OS-sandbox + # enforcement test is macOS-gated, so it doesn't red this linux runner. 
+ E2E_ENVIRONMENT: ${{ vars.E2E_ENVIRONMENT }} diff --git a/.gitignore b/.gitignore index fa269ba709..fe3abb2fe8 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,8 @@ bin/ # tsup bundled config artifacts (temporary files left behind when bundling TS configs) *.config.bundled_*.mjs +# vite bundled config artifacts (left behind when a vitest run is interrupted) +*.config.ts.timestamp-*.mjs # Environment .env diff --git a/packages/agent/e2e/README.md b/packages/agent/e2e/README.md new file mode 100644 index 0000000000..21ff545fe7 --- /dev/null +++ b/packages/agent/e2e/README.md @@ -0,0 +1,94 @@ +# Live agent e2e suite + +Drives representative sessions **end to end** through the real adapter, the real +binary (codex `app-server` / Claude Code CLI), and the real llm-gateway on a cheap +model — parametrized across `claude` and `codex`. The only thing mocked is the +host/UI client (a recording `sessionUpdate`, an auto-allow `requestPermission`, +and real file read/write against a throwaway git repo). Nothing in the +agent/model/tool path is stubbed. + +## What it covers + +Two suites, each a per-adapter loop with `describe.skipIf` over `["claude", +"codex"]` (titles carry a `(claude)` / `(codex)` marker so `-t "(codex)"` selects +one arm across both files): + +`session-lifecycle.e2e.test.ts` — one shared golden turn plus focused scenarios: +- **newSession config options** — model / effort selectors are offered. +- **working turn** — `initialize → newSession → prompt` (read a file, edit a + line, run a command): streamed assistant text, tool calls + a completed tool + call, the exact usage signal, `stopReason: end_turn`, the real on-disk file + edit, and (codex) the `_posthog/sdk_session` + `_posthog/turn_complete` + ext-notifications. +- **setSessionConfigOption** — switching a config option is accepted + acked. +- **interrupt** — `cancel` during an in-flight (unbounded) turn yields `cancelled`. +- **resumeSession** — reconnect returns config options. 
+- **loadSession** — a fresh connection reattaches and the transcript replays + (asserts the tool transcript replays, not just any update). + +Codex-only (advertised codex capabilities; registered as skipped on the claude +arm so the gap is visible): +- **mode switch** → `current_mode_update` (the `setSessionConfigOption`-driven switch also runs on the claude arm). +- **steering** — a mid-turn prompt folds into the running turn via `turn/steer`. +- **list + fork** — `listSessions` finds the session; `forkSession` branches it. + +The command/file approval `{decision}` round-trip is **not** covered here: codex +spawns under a `danger-full-access` sandbox and auto-approves, so it never sends +an approval request to assert on. That envelope is covered by unit tests instead. + +`structured-output.e2e.test.ts` — `_meta.jsonSchema` + `onStructuredOutput` +delivers a parsed, schema-constrained object (the signals-pipeline contract). + +Assertions are structural lifecycle invariants + the deterministic file/JSON +side effects — never model prose — so they hold across adapters and cheap models. + +## Structure + +- `config.ts` — gateway/token/model resolution, per-adapter env wiring, skip logic. +- `driver.ts` — the in-process ACP host client (recording capture, auto-allow, + real FS), `openConnection` / `openSession` helpers, the throwaway-repo helpers, + and `waitFor`. +- `*.e2e.test.ts` — the scenarios. + +## Running + +These never run under `pnpm test` or the per-PR unit/integration jobs (the default vitest config only +includes `src/**`). They are opt-in and cost a couple of short model turns. + +In CI they run as the **`e2e` job in `.github/workflows/test.yml`**, on pull +requests only, after the unit + integration jobs pass. The job is opt-in and safe +by default: it is skipped unless the repo variable `AGENT_E2E_ENABLED` is `true`, and once enabled +it needs an `E2E_GATEWAY_TOKEN` secret and an `E2E_GATEWAY_URL` variable pointing at a +gateway reachable from the runner; it never runs for fork PRs (their secrets +are withheld, which would otherwise red the fail-loud token guard).
Off by +default, so it costs nothing until explicitly enabled; if the native binary isn't on +the runner, the codex arm skips and the guard test (`guard.e2e.test.ts`) reds the run. + +```bash +# from packages/agent — reads the local dev API key from the posthog repo, runs both arms +bash e2e/run-e2e.sh + +# just one adapter (matches the (codex) / (claude) marker in every title) +bash e2e/run-e2e.sh -t "(codex)" +``` + +Prereqs: a local llm-gateway up (`./bin/start` in the posthog repo) and the +native codex binary present at `apps/code/resources/codex-acp/codex` (if it is +missing, the codex arm skips and the guard test fails the run). + +## Configuration (env) + +| Var | Default | Notes | +| --- | --- | --- | +| `E2E_GATEWAY_TOKEN` | — | Required. A token the gateway accepts — the `llm_gateway` product takes a personal API key (no OAuth). Without it every arm skips and the guard test fails the run. `run-e2e.sh` reads the local dev key. | +| `E2E_GATEWAY_URL` | `http://localhost:3308/llm_gateway` | Gateway base (codex appends `/v1`). `llm_gateway` accepts a personal API key; `posthog_code` is OAuth-only. | +| `E2E_CLAUDE_MODEL` | `claude-haiku-4-5` | Override if the gateway serves a different cheap Claude id. | +| `E2E_CODEX_MODEL` | `gpt-5-mini` | Cheapest codex id the local gateway serves; override if needed. | +| `POSTHOG_REPO` | sibling `../posthog` | Where `run-e2e.sh` reads the local dev key from. | +| `E2E_DEBUG` | — | `1` for verbose adapter logging. | + +If a default model isn't served by your gateway, the turn fails loudly (never a +false green) — set the matching `E2E_*_MODEL`. + +Each arm self-skips with a visible reason (missing token / missing binary) rather +than passing silently; `guard.e2e.test.ts` turns either skip into a red run.
diff --git a/packages/agent/e2e/compaction.e2e.test.ts b/packages/agent/e2e/compaction.e2e.test.ts new file mode 100644 index 0000000000..3a6653d1e4 --- /dev/null +++ b/packages/agent/e2e/compaction.e2e.test.ts @@ -0,0 +1,101 @@ +import { afterAll, beforeAll, describe, expect, it } from "vitest"; +import { type Adapter, E2E } from "./config"; +import { + cleanupRepo, + killCodexStragglers, + openSession, + setupRepo, +} from "./driver"; + +/** + * Live compaction e2e — codex only. codex auto-compacts when the context crosses + * `model_auto_compact_token_limit`; we spawn with a low limit and a big cheap input + * blob so a later turn trips it, and the adapter must surface `_posthog/compact_boundary`. + * Claude is excluded: its manual `/compact` hangs `prompt()` and forcing auto + * compaction is too costly. Tuning: if it never compacts, raise the limit and FILLER together. + */ +const ADAPTERS: Adapter[] = ["codex"]; + +// A limit above codex's resident baseline, with FILLER > limit so the crossing is baseline-independent. +const AUTO_COMPACT_TOKEN_LIMIT = 16000; +// ~20k tokens (~45 chars ≈ 11 tokens × 1800) — larger than the limit above. +const FILLER = "The quick brown fox jumps over the lazy dog. ".repeat(1800); +const MAX_CODEX_TURNS = 3; + +for (const adapter of ADAPTERS) { + const skip = E2E.skipReason(adapter); + const title = `compaction (${adapter})${skip ? ` — SKIPPED (${skip})` : ""}`; + + describe.skipIf(!!skip)(title, () => { + let repo: string; + + beforeAll(() => { + if (adapter === "codex") killCodexStragglers(); + E2E.configureEnv(adapter); + repo = setupRepo(); + }); + + afterAll(() => { + cleanupRepo(repo); + }); + + it("surfaces a compaction to the host via compact_boundary", async () => { + const s = await openSession({ + adapter, + cwd: repo, + codexOptions: + adapter === "codex" + ? E2E.codexOptions(repo, { + // The model-scoped key is the effective one; set both to be safe. 
+ model_auto_compact_token_limit: AUTO_COMPACT_TOKEN_LIMIT, + auto_compact_token_limit: AUTO_COMPACT_TOKEN_LIMIT, + }) + : undefined, + meta: { + systemPrompt: "You are a coding assistant in a tiny test repo.", + model: E2E.model(adapter), + permissionMode: "bypassPermissions", + taskRunId: "e2e-compaction", + }, + }); + try { + const compacted = () => + s.capture.extMethods().includes("_posthog/compact_boundary"); + + if (adapter === "claude") { + // Unreachable while ADAPTERS is codex-only (see the header comment): a little conversation, then a manual /compact. + await s.conn.prompt({ + sessionId: s.sessionId, + prompt: [{ type: "text", text: "Reply with only: hello." }], + }); + await s.conn.prompt({ + sessionId: s.sessionId, + prompt: [{ type: "text", text: "/compact" }], + }); + } else { + // codex: turn 1's big input blob fills the context past the limit; turn 2+ + // trips auto-compaction. Stop once the boundary is surfaced. + for (let i = 0; i < MAX_CODEX_TURNS && !compacted(); i++) { + const text = + i === 0 + ? `Reference text — do not summarize, reply with only: OK.\n\n${FILLER}` + : "Reply with only: DONE."; + await s.conn.prompt({ + sessionId: s.sessionId, + prompt: [{ type: "text", text }], + }); + } + } + + expect( + compacted(), + `expected a _posthog/compact_boundary; saw methods: ${s.capture + .extMethods() + .join(", ")}`, + ).toBe(true); + } finally { + await s.cleanup(); + } + }, 300_000); + }); +} diff --git a/packages/agent/e2e/config.ts b/packages/agent/e2e/config.ts new file mode 100644 index 0000000000..0670dd82eb --- /dev/null +++ b/packages/agent/e2e/config.ts @@ -0,0 +1,106 @@ +import { existsSync } from "node:fs"; +import { join } from "node:path"; + +export type Adapter = "claude" | "codex"; + +/** + * Live e2e configuration, resolved entirely from the environment so no secret is + * committed. 
Needs a local llm-gateway and a token in `E2E_GATEWAY_TOKEN`; targets + * the `llm_gateway` product, which accepts a personal API key (no OAuth mint, + * unlike prod's `posthog_code`). Without the token every arm self-skips. + */ +// `||` not `??`: CI sets unset vars to "" which should fall back to the default. +const GATEWAY_URL = + process.env.E2E_GATEWAY_URL || "http://localhost:3308/llm_gateway"; +const TOKEN = process.env.E2E_GATEWAY_TOKEN ?? ""; + +// The native app-server binary, relative to packages/agent/e2e. +const NATIVE_CODEX_BIN = join( + __dirname, + "..", + "..", + "..", + "apps", + "code", + "resources", + "codex-acp", + "codex", +); + +/** The gateway base with exactly one trailing `/v1` (codex / OpenAI-format endpoint); tolerates trailing slashes and an existing `/v1`. */ +function openAiBase(): string { + return `${GATEWAY_URL.replace(/\/+$/, "").replace(/\/v1$/, "")}/v1`; +} + +export const E2E = { + token: TOKEN, + hasToken: !!TOKEN, + gatewayUrl: GATEWAY_URL, + codexBin: NATIVE_CODEX_BIN, + /** Deployment environment. `E2E_ENVIRONMENT=cloud` exercises the cloud code path; undefined = local. */ + environment: + (process.env.E2E_ENVIRONMENT as "local" | "cloud" | undefined) || undefined, + + /** Cheap model per adapter, overridable via `E2E_CLAUDE_MODEL` / `E2E_CODEX_MODEL`. */ + model(adapter: Adapter): string { + // `||` so an empty CI variable falls back to the default. + if (adapter === "claude") { + return process.env.E2E_CLAUDE_MODEL || "claude-haiku-4-5"; + } + // gpt-5-mini is on the product block list, but that gate is only enforced in + // Agent.run — the e2e drives createAcpConnection directly, so it's accepted. + return process.env.E2E_CODEX_MODEL || "gpt-5-mini"; + }, + + /** Null => runnable; a string => skip this arm with that reason (never silent). 
*/ + skipReason(adapter: Adapter): string | null { + if (!TOKEN) return "E2E_GATEWAY_TOKEN not set"; + if (adapter === "codex" && !existsSync(NATIVE_CODEX_BIN)) { + return `native codex binary missing at ${NATIVE_CODEX_BIN}`; + } + return null; + }, + + /** Point the adapter at the gateway as the host's `configureEnvironment` does. */ + configureEnv(adapter: Adapter): void { + if (adapter === "claude") { + process.env.ANTHROPIC_BASE_URL = GATEWAY_URL; + process.env.ANTHROPIC_AUTH_TOKEN = TOKEN; + return; + } + process.env.OPENAI_BASE_URL = openAiBase(); + process.env.OPENAI_API_KEY = TOKEN; + process.env.POSTHOG_CODEX_USE_APP_SERVER = "1"; + }, + + /** The codexOptions the codex arm passes through `createAcpConnection`. */ + codexOptions( + cwd: string, + configOverrides?: Record<string, unknown>, + modelOverride?: string, + ): { + cwd: string; + binaryPath: string; + apiBaseUrl: string; + apiKey: string; + model: string; + configOverrides?: Record<string, unknown>; + } { + return { + cwd, + binaryPath: NATIVE_CODEX_BIN, + apiBaseUrl: openAiBase(), + apiKey: TOKEN, + model: modelOverride || this.model("codex"), + ...(configOverrides ? { configOverrides } : {}), + }; + }, + + /** A stronger model for tests the cheapest models can't handle (e.g. structured-output decodes). `E2E_*_STRONG_MODEL` overrides it without touching the cheap model; `E2E_*_MODEL` is still honoured as a fallback. */ + strongModel(adapter: Adapter): string { + if (adapter === "claude") { + return process.env.E2E_CLAUDE_STRONG_MODEL || process.env.E2E_CLAUDE_MODEL || "claude-sonnet-4-5"; + } + return process.env.E2E_CODEX_STRONG_MODEL || process.env.E2E_CODEX_MODEL || "gpt-5.5"; + }, +}; diff --git a/packages/agent/e2e/driver.ts b/packages/agent/e2e/driver.ts new file mode 100644 index 0000000000..7110e6f40a --- /dev/null +++ b/packages/agent/e2e/driver.ts @@ -0,0 +1,287 @@ +/** + * Adapter-agnostic ACP driver for the live e2e suite. Stands up the same in-process + * ACP transport the real host uses and drives a real adapter + binary + gateway. + * The only thing mocked is the host/UI client (recording sessionUpdate, auto-allow + * requestPermission, real fs read/write against the test repo). 
+ */ +import { execFileSync } from "node:child_process"; +import { + promises as fsp, + mkdtempSync, + readFileSync, + realpathSync, + rmSync, + writeFileSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import { join, resolve } from "node:path"; +// @ts-expect-error - runtime ESM export resolved by vitest +import { ClientSideConnection, ndJsonStream } from "@agentclientprotocol/sdk"; +import { createAcpConnection } from "../src/adapters/acp-connection"; +import { Logger } from "../src/utils/logger"; +import { type Adapter, E2E } from "./config"; + +export type { Adapter } from "./config"; + +export interface CapturedEvent { + kind: "sessionUpdate" | "requestPermission" | "extNotification"; + sessionUpdate?: string; + method?: string; + data?: Record; +} + +export interface Capture { + events: CapturedEvent[]; + updates(type: string): CapturedEvent[]; + approvals(): CapturedEvent[]; + extMethods(): string[]; +} + +export interface NewSessionResponse { + sessionId: string; + configOptions?: ConfigOption[]; + modes?: unknown; +} + +export interface ConfigOption { + id?: string; + category?: string; + currentValue?: unknown; + options?: Array<{ name?: string; value?: unknown }>; +} + +export interface AcpConn { + initialize: (p: unknown) => Promise; + newSession: (p: unknown) => Promise; + loadSession: (p: unknown) => Promise; + resumeSession: (p: unknown) => Promise; + listSessions: ( + p: unknown, + ) => Promise<{ sessions?: Array<{ sessionId?: string }> }>; + unstable_forkSession: (p: unknown) => Promise; + prompt: (p: unknown) => Promise<{ stopReason?: string; usage?: unknown }>; + setSessionConfigOption: (p: unknown) => Promise; + cancel: (p: unknown) => Promise; + // Client→agent ext-method (the host drives _posthog/refresh_session). 
+ extMethod: (method: string, params: unknown) => Promise<unknown>; +} + +export interface E2EConnection { + conn: AcpConn; + capture: Capture; + cleanup: () => Promise<void>; +} + +/** + * The ACP `initialize` params our host client sends. Matches the cloud host, which + * advertises no clientCapabilities — so the adapter runs file/terminal tools + * in-process rather than proxying through the host's fs callbacks. + */ +export const INIT_PARAMS = { + protocolVersion: 1, + clientCapabilities: {}, +}; + +export function openConnection(opts: { + adapter: Adapter; + cwd: string; + codexOptions?: Record<string, unknown>; + onStructuredOutput?: (output: Record<string, unknown>) => Promise<void>; +}): E2EConnection { + const { adapter, cwd } = opts; + const events: CapturedEvent[] = []; + + // Mirror the cloud host's client surface. Deliberately no extMethod: the real + // host doesn't implement it, so an adapter calling it should fail e2e as in prod. + const client = { + async sessionUpdate(p: any): Promise<void> { + events.push({ + kind: "sessionUpdate", + sessionUpdate: p?.update?.sessionUpdate, + data: p?.update, + }); + }, + async requestPermission(p: any): Promise<unknown> { + events.push({ + kind: "requestPermission", + data: { + title: p?.toolCall?.title, + kind: p?.toolCall?.kind, + // request_user_input surfaces as a permission with codeToolKind: "question"; codex only offers it in Plan mode. + codeToolKind: p?.toolCall?._meta?.codeToolKind, + }, + }); + const options = p?.options ?? []; + const allow = + options.find( + (o: any) => o?.kind === "allow_once" || o?.kind === "allow_always", + ) ?? options[0]; + return { + outcome: { outcome: "selected", optionId: allow?.optionId ?? 
"allow" }, + }; + }, + async readTextFile(p: any): Promise<unknown> { + return { content: await fsp.readFile(resolve(cwd, p.path), "utf8") }; + }, + async writeTextFile(p: any): Promise<unknown> { + await fsp.writeFile(resolve(cwd, p.path), p.content); + return {}; + }, + async extNotification(method: string, params: any): Promise<void> { + events.push({ kind: "extNotification", method, data: params }); + }, + }; + + const logger = new Logger({ + debug: !!process.env.E2E_DEBUG, + prefix: "[e2e]", + }); + const acp = createAcpConnection({ + adapter, + codexOptions: opts.codexOptions as any, + onStructuredOutput: opts.onStructuredOutput, + logger, + }); + const stream = ndJsonStream( + acp.clientStreams.writable, + acp.clientStreams.readable, + ); + const conn = new ClientSideConnection( + () => client, + stream, + ) as unknown as AcpConn; + + const capture: Capture = { + events, + updates: (type) => + events.filter( + (e) => e.kind === "sessionUpdate" && e.sessionUpdate === type, + ), + approvals: () => events.filter((e) => e.kind === "requestPermission"), + extMethods: () => [ + ...new Set( + events + .filter((e) => e.kind === "extNotification" && e.method) + .map((e) => e.method as string), + ), + ], + }; + + return { + conn, + capture, + cleanup: async () => { + // Bounded: a wedged adapter cleanup must never hang the suite. The bound timer is cleared so it cannot outlive the race. + let timer: ReturnType<typeof setTimeout> | undefined; + const bound = new Promise<void>((r) => { timer = setTimeout(r, 8000); }); + await Promise.race([acp.cleanup().catch(() => undefined), bound]); + clearTimeout(timer); // no-op if the bound already fired + }, + }; +} + +export interface OpenSession { + conn: AcpConn; + capture: Capture; + sessionId: string; + newSession: NewSessionResponse; + cleanup: () => Promise<void>; +} + +/** openConnection + initialize + newSession — the common scenario setup. 
*/ +export async function openSession(opts: { + adapter: Adapter; + cwd: string; + codexOptions?: Record<string, unknown>; + onStructuredOutput?: (output: Record<string, unknown>) => Promise<void>; + meta: Record<string, unknown>; +}): Promise<OpenSession> { + const c = openConnection(opts); + await c.conn.initialize(INIT_PARAMS); + const newSession = await c.conn.newSession({ + cwd: opts.cwd, + mcpServers: [], + // Inject E2E_ENVIRONMENT so the suite can run as a cloud session without threading it through every test's meta. + _meta: { + ...opts.meta, + ...(E2E.environment ? { environment: E2E.environment } : {}), + }, + }); + return { + conn: c.conn, + capture: c.capture, + sessionId: newSession.sessionId, + newSession, + cleanup: c.cleanup, + }; +} + +export const ORIGINAL_TARGET = "line1\nline2\nline3\n"; + +export function setupRepo(): string { + // realpath so cwd is canonical: on macOS os.tmpdir() is a symlink. The Claude + // SDK keys its session store by the resolved path, so loadSession's replay finds + // nothing if a fresh connection uses a different path. + const repo = realpathSync(mkdtempSync(join(tmpdir(), "agent-e2e-"))); + writeFileSync(join(repo, "target.txt"), ORIGINAL_TARGET); + execFileSync("git", ["init", "-q"], { cwd: repo }); + execFileSync("git", ["add", "-A"], { cwd: repo }); + // -c commit.gpgsign=false: ignore the user's global signing config, which fails in this non-interactive context. + execFileSync( + "git", + [ + "-c", + "commit.gpgsign=false", + "-c", + "user.email=e2e@posthog.dev", + "-c", + "user.name=e2e", + "commit", + "-qm", + "init", + ], + { cwd: repo }, + ); + return repo; +} + +export function readTarget(repo: string): string { + return readFileSync(join(repo, "target.txt"), "utf8"); +} + +export function cleanupRepo(repo: string): void { + try { + rmSync(repo, { recursive: true, force: true }); + } catch { + /* best effort */ + } +} + +/** Poll `fn` until it returns a non-undefined value or the timeout elapses. 
*/ +export async function waitFor<T>( + fn: () => T | undefined, + timeoutMs = 5000, + intervalMs = 100, +): Promise<T | undefined> { + const start = Date.now(); + for (;;) { + const value = fn(); + if (value !== undefined) return value; + if (Date.now() - start >= timeoutMs) return undefined; + await new Promise<void>((r) => setTimeout(r, intervalMs)); + } +} + +/** + * codex spawns detached; a killed run can orphan it holding a flock under + * ~/.codex/tmp, wedging the next run. Kill stragglers first to release the flock. + */ +export function killCodexStragglers(): void { + try { + execFileSync("pkill", ["-9", "-f", "resources/codex-acp"], { + stdio: "ignore", + }); + } catch { + /* none running */ + } +} diff --git a/packages/agent/e2e/guard.e2e.test.ts b/packages/agent/e2e/guard.e2e.test.ts new file mode 100644 index 0000000000..bf859a4969 --- /dev/null +++ b/packages/agent/e2e/guard.e2e.test.ts @@ -0,0 +1,30 @@ +import { describe, expect, it } from "vitest"; +import { E2E } from "./config"; + +/** + * Fail-loud precondition for the live e2e suite. Without E2E_GATEWAY_TOKEN every + * arm self-skips and `vitest run` exits 0 — a green run that tested nothing. This + * one non-skipped test turns a missing token into a RED run. + */ +describe("live e2e preconditions", () => { + it("requires E2E_GATEWAY_TOKEN (else the suite would skip-to-green)", () => { + expect( + E2E.hasToken, + "E2E_GATEWAY_TOKEN is not set — every adapter arm would skip and the run " + + "would pass without testing anything. Mint one via e2e/run-e2e.sh or " + + "set E2E_GATEWAY_TOKEN against a reachable E2E_GATEWAY_URL.", + ).toBe(true); + }); + + // When a token is present, the codex arm must not skip silently — a missing + // binary would let the run pass with zero codex coverage. 
+ it("requires the native codex binary when a token is set (else codex skips-to-green)", () => { + if (!E2E.hasToken) return; // no token → whole suite skips; nothing to guard + expect( + E2E.skipReason("codex"), + "E2E_GATEWAY_TOKEN is set but the native codex binary is missing — the " + + "codex arm would silently skip and the run would pass without exercising " + + "the codex adapter. Ensure apps/code/scripts/download-binaries.mjs ran.", + ).toBeNull(); + }); +}); diff --git a/packages/agent/e2e/run-e2e.sh b/packages/agent/e2e/run-e2e.sh new file mode 100755 index 0000000000..37e18e4054 --- /dev/null +++ b/packages/agent/e2e/run-e2e.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +# Run the live golden-path e2e for both adapters (claude + codex). +# +# Needs a local llm-gateway (run `./bin/start` in the posthog repo) and a token. +# The suite targets the gateway's `llm_gateway` product, which accepts a personal +# API key (no OAuth), so if E2E_GATEWAY_TOKEN is unset this reads the repo's +# hardcoded local dev key from ee/settings.py (override the repo with POSTHOG_REPO). +# That key must be registered in the local DB — run `python manage.py +# setup_local_api_key` in the posthog repo once if auth fails. +# +# Usage: +# bash e2e/run-e2e.sh # both adapters, both suites +# bash e2e/run-e2e.sh -t "(codex)" # only the codex arm (vitest -t name filter) +# Env overrides: E2E_GATEWAY_URL, E2E_CLAUDE_MODEL, E2E_CODEX_MODEL, E2E_DEBUG=1 +set -euo pipefail + +HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +AGENT_DIR="$(cd "$HERE/.." && pwd)" +POSTHOG_REPO="${POSTHOG_REPO:-$(cd "$AGENT_DIR/../../.." && pwd)/posthog}" + +if [[ -z "${E2E_GATEWAY_TOKEN:-}" ]]; then + SETTINGS="$POSTHOG_REPO/ee/settings.py" + if [[ ! -f "$SETTINGS" ]]; then + echo "E2E_GATEWAY_TOKEN unset and posthog settings not found at $SETTINGS." >&2 + echo "Set E2E_GATEWAY_TOKEN, or POSTHOG_REPO to the posthog checkout." 
>&2 + exit 1 + fi + # The `llm_gateway` product accepts personal API keys, so no OAuth mint needed. `sed -n … p` emits only a successfully extracted key, and `|| true` lets a no-match reach the friendly error below instead of dying under `set -e` / pipefail. + E2E_GATEWAY_TOKEN="$(sed -nE 's/^DEV_API_KEY[[:space:]]*=[[:space:]]*"([^"]+)".*/\1/p' "$SETTINGS" | head -1 || true)" +fi + +if [[ -z "${E2E_GATEWAY_TOKEN:-}" ]]; then + echo "Failed to obtain an E2E_GATEWAY_TOKEN (no DEV_API_KEY in ee/settings.py?)." >&2 + echo "If auth then fails, run 'python manage.py setup_local_api_key' in the posthog repo." >&2 + exit 1 +fi + +export E2E_GATEWAY_TOKEN +echo "token: ${E2E_GATEWAY_TOKEN:0:8}… gateway: ${E2E_GATEWAY_URL:-http://localhost:3308/llm_gateway}" +cd "$AGENT_DIR" +pnpm test:e2e "$@" diff --git a/packages/agent/e2e/session-lifecycle.e2e.test.ts b/packages/agent/e2e/session-lifecycle.e2e.test.ts new file mode 100644 index 0000000000..4de1acf3e5 --- /dev/null +++ b/packages/agent/e2e/session-lifecycle.e2e.test.ts @@ -0,0 +1,572 @@ +import { afterAll, beforeAll, describe, expect, it } from "vitest"; +import { type Adapter, E2E } from "./config"; +import { + type Capture, + type ConfigOption, + cleanupRepo, + INIT_PARAMS, + killCodexStragglers, + type NewSessionResponse, + ORIGINAL_TARGET, + openConnection, + openSession, + readTarget, + setupRepo, + waitFor, +} from "./driver"; + +/** + * Live session-lifecycle e2e per adapter: drives a real session end to end against + * the real gateway + binary on a cheap model. Assertions are structural lifecycle + * invariants + the on-disk edit, never model prose. Opt-in: each arm self-skips + * unless `E2E_GATEWAY_TOKEN` is set (codex also needs the native binary). + */ +const ADAPTERS: Adapter[] = ["claude", "codex"]; + +const EDIT_PROMPT = + "Do exactly these steps and nothing else: 1) Read the file target.txt. " + + "2) Edit it so the second line reads FOO instead of line2. " + + "3) Run the shell command `cat target.txt`. 
" + + "4) In one sentence confirm what you changed, then stop."; + +for (const adapter of ADAPTERS) { + const skip = E2E.skipReason(adapter); + const title = `session lifecycle (${adapter})${skip ? ` — SKIPPED (${skip})` : ""}`; + // Codex-only; skipped on the claude arm so the gap is visible. + const itCodex = adapter === "codex" ? it : it.skip; + // Read-only profile only tightens per-turn on macOS + non-cloud (elsewhere the + // spawn is danger-full-access / no profile), so gate to where it actually applies. + const itCodexSandbox = + adapter === "codex" && + process.platform === "darwin" && + E2E.environment !== "cloud" + ? it + : it.skip; + + describe.skipIf(!!skip)(title, () => { + let repo: string; + const codexOptions = () => + adapter === "codex" ? E2E.codexOptions(repo) : undefined; + const meta = (extra: Record = {}) => ({ + systemPrompt: "You are a coding assistant in a tiny test repo.", + model: E2E.model(adapter), + permissionMode: "bypassPermissions", + // Drives the cloud ext-notifications (_posthog/sdk_session + turn_complete). + taskRunId: "e2e-run", + ...extra, + }); + + let sessionId: string; + let newSessionResponse: NewSessionResponse; + let turn: + | { stopReason?: string; capture: Capture; target: string } + | undefined; + let goldenError: unknown; + + beforeAll(async () => { + if (adapter === "codex") killCodexStragglers(); + E2E.configureEnv(adapter); + repo = setupRepo(); + const s = await openSession({ + adapter, + cwd: repo, + codexOptions: codexOptions(), + meta: meta(), + }); + sessionId = s.sessionId; + newSessionResponse = s.newSession; + try { + const res = await s.conn.prompt({ + sessionId, + prompt: [{ type: "text", text: EDIT_PROMPT }], + }); + turn = { + stopReason: res.stopReason, + capture: s.capture, + target: readTarget(repo), + }; + } catch (err) { + // Don't fail the whole describe on a flaky golden turn — record it so only + // the test that consumes `turn` fails. 
+ goldenError = err; + } finally { + await s.cleanup(); + } + }, 180_000); + + afterAll(() => { + cleanupRepo(repo); + }); + + it("newSession exposes selectable config options (model / effort)", () => { + const opts = newSessionResponse.configOptions ?? []; + expect(opts.length).toBeGreaterThan(0); + expect(opts.some((o) => (o.options?.length ?? 0) > 1)).toBe(true); + }); + + it("streams a working turn: assistant text, tool calls, usage, file edit", () => { + if (goldenError) throw goldenError; + if (!turn) throw new Error("golden turn did not produce a result"); + expect(turn.stopReason).toBe("end_turn"); + expect( + turn.capture.updates("agent_message_chunk").length, + ).toBeGreaterThan(0); + expect(turn.capture.updates("tool_call").length).toBeGreaterThan(0); + const anyToolCompleted = [ + ...turn.capture.updates("tool_call"), + ...turn.capture.updates("tool_call_update"), + ].some((e) => e.data?.status === "completed"); + expect(anyToolCompleted).toBe(true); + + const hasUsage = + turn.capture.updates("usage_update").length > 0 || + turn.capture.extMethods().includes("_posthog/usage_update"); + expect(hasUsage).toBe(true); + + expect(turn.capture.extMethods()).toContain("_posthog/sdk_session"); + + expect(turn.target).not.toBe(ORIGINAL_TARGET); + expect(turn.target).toContain("FOO"); + + // codex additionally emits turn_complete; claude signals completion via the prompt response. + if (adapter === "codex") { + // Reasoning parity is unit-covered (mapping.test.ts); a live assertion + // would be flaky on the cheap model. + expect(turn.capture.extMethods()).toContain("_posthog/turn_complete"); + const tc = turn.capture.events.find( + (e) => + e.kind === "extNotification" && + e.method === "_posthog/turn_complete", + ); + const usage = (tc?.data as { usage?: Record })?.usage; + expect(usage).toBeTruthy(); + expect(usage?.totalTokens ?? 0).toBeGreaterThan(0); + expect(usage?.totalTokens).toBe( + (usage?.inputTokens ?? 0) + + (usage?.outputTokens ?? 
0) + + (usage?.cachedReadTokens ?? 0) + + (usage?.cachedWriteTokens ?? 0), + ); + } + }); + + it("switches a config option via setSessionConfigOption", async () => { + const s = await openSession({ + adapter, + cwd: repo, + codexOptions: codexOptions(), + meta: meta(), + }); + try { + const opt = (s.newSession.configOptions ?? []).find( + (o) => (o.options?.length ?? 0) > 1, + ); + expect( + opt, + "expected a config option with multiple values", + ).toBeTruthy(); + const alt = + opt?.options?.find((v) => v.value !== opt.currentValue) ?? + opt?.options?.[0]; + const res = await s.conn.setSessionConfigOption({ + sessionId: s.sessionId, + configId: opt?.id, + value: alt?.value, + }); + expect(res).toBeTruthy(); + if (adapter === "codex") { + // codex re-emits config_option_update as the side effect of a switch. + expect( + s.capture.updates("config_option_update").length, + ).toBeGreaterThan(0); + } else { + // claude returns updated configOptions — assert the switch actually took, + // not merely that an ack array was produced (unconditionally true). + const updated = ((res?.configOptions ?? []) as ConfigOption[]).find( + (o) => o.id === opt?.id, + ); + expect(updated?.currentValue).toBe(alt?.value); + } + } finally { + await s.cleanup(); + } + }, 90_000); + + // Cloud host switches mode only via setSessionConfigOption(configId:"mode"), so exercise both arms. + it("emits current_mode_update when the mode is switched via setSessionConfigOption", async () => { + if (adapter === "codex") killCodexStragglers(); + const s = await openSession({ + adapter, + cwd: repo, + codexOptions: codexOptions(), + meta: meta(), + }); + try { + // codex synthesizes modes; claude exposes a "mode" configOption — pick an alternate value. + let value = "read-only"; + if (adapter === "claude") { + const modeOpt = (s.newSession.configOptions ?? []).find( + (o) => o.id === "mode", + ); + value = + (modeOpt?.options?.find((v) => v.value !== modeOpt.currentValue) + ?.value as string) ?? 
"plan"; + } + await s.conn.setSessionConfigOption({ + sessionId: s.sessionId, + configId: "mode", + value, + }); + expect(s.capture.updates("current_mode_update").length).toBeGreaterThan( + 0, + ); + } finally { + await s.cleanup(); + } + }, 60_000); + + // Proves the mode picker isn't cosmetic: read-only maps to an OS-level + // :read-only profile that blocks the write even though the host auto-approves. + // macOS-only (see itCodexSandbox). + itCodexSandbox( + "read-only mode actually blocks a file edit (sandbox restricts, not just approval)", + async () => { + if (adapter === "codex") killCodexStragglers(); + const s = await openSession({ + adapter, + cwd: repo, + codexOptions: codexOptions(), + meta: meta(), + }); + try { + await s.conn.setSessionConfigOption({ + sessionId: s.sessionId, + configId: "mode", + value: "read-only", + }); + const before = readTarget(repo); + const res = await s.conn.prompt({ + sessionId: s.sessionId, + prompt: [ + { + type: "text", + text: + "Use your file-editing tool to change target.txt so its second " + + "line reads SENTINEL_RO_EDIT. You MUST attempt the edit with your " + + "tool even if it appears restricted. Then stop.", + }, + ], + }); + expect(res.stopReason).toBeTruthy(); + // >=1 tool call, so a pure prose no-op can't masquerade as enforcement. + expect(s.capture.updates("tool_call").length).toBeGreaterThan(0); + // File unchanged: the read-only sandbox blocked the write despite host auto-approval. + expect(readTarget(repo)).toBe(before); + expect(readTarget(repo)).not.toContain("SENTINEL_RO_EDIT"); + } finally { + await s.cleanup(); + } + }, + 180_000, + ); + + // Proves Plan is a real mode: codex only offers request_user_input in its plan + // collaboration mode. Also covers the revert — the collaboration mode is sticky, + // so switching back to auto must push default explicitly. 
+ itCodex( + "plan mode engages codex's plan collaboration, and reverts when switched back to auto", + async () => { + if (adapter === "codex") killCodexStragglers(); + const s = await openSession({ + adapter, + cwd: repo, + codexOptions: codexOptions(), + meta: meta(), + }); + const askToUseTool = + "Before doing anything else, you MUST call the request_user_input tool " + + "to ask the user a single question: whether to proceed with approach A " + + "or approach B. Ask exactly that one question via the tool, then stop."; + const questionCount = () => + s.capture + .approvals() + .filter((e) => e.data?.codeToolKind === "question").length; + try { + await s.conn.setSessionConfigOption({ + sessionId: s.sessionId, + configId: "mode", + value: "plan", + }); + await s.conn.prompt({ + sessionId: s.sessionId, + prompt: [{ type: "text", text: askToUseTool }], + }); + const afterPlan = questionCount(); + expect(afterPlan).toBeGreaterThan(0); + + // Switch back to auto: request_user_input is gone, so the same prompt yields no new question. + await s.conn.setSessionConfigOption({ + sessionId: s.sessionId, + configId: "mode", + value: "auto", + }); + await s.conn.prompt({ + sessionId: s.sessionId, + prompt: [{ type: "text", text: askToUseTool }], + }); + expect(questionCount()).toBe(afterPlan); + } finally { + await s.cleanup(); + } + }, + 240_000, + ); + + it("handles the host's refresh_session extMethod per adapter", async () => { + if (adapter === "codex") killCodexStragglers(); + const s = await openSession({ + adapter, + cwd: repo, + codexOptions: codexOptions(), + meta: meta(), + }); + try { + const call = s.conn.extMethod("_posthog/refresh_session", { + mcpServers: [], + }); + if (adapter === "claude") { + // claude implements refresh_session; haiku is on the MCP-injection exclude + // list, so it rejects on the model gate (not method-not-found), proving the + // call reaches the handler. 
+ await expect(call).rejects.toThrow(/MCP injection/i); + } else { + // codex doesn't implement extMethod — the call rejects cleanly (known adapter divergence). + await expect(call).rejects.toThrow(); + } + } finally { + await s.cleanup(); + } + }, 60_000); + + // Known gap: the approval {decision} round-trip and requestPermission policy + // aren't exercised here (codex auto-approves under danger-full-access) — + // unit-covered in codex-app-server-agent.test.ts / approvals.test.ts. + + it("incorporates a prompt's _meta.prContext without error", async () => { + if (adapter === "codex") killCodexStragglers(); + const s = await openSession({ + adapter, + cwd: repo, + codexOptions: codexOptions(), + meta: meta(), + }); + try { + // The host attaches prContext on PR-follow-up runs; both adapters prepend it. + const res = await s.conn.prompt({ + sessionId: s.sessionId, + prompt: [ + { + type: "text", + text: "Acknowledge the linked pull request in one short sentence, then stop.", + }, + ], + _meta: { + prContext: + "Context: PR #4242 'Fix the thing' is open and under review.", + }, + }); + expect(res.stopReason).toBe("end_turn"); + expect(s.capture.updates("agent_message_chunk").length).toBeGreaterThan( + 0, + ); + } finally { + await s.cleanup(); + } + }, 120_000); + + itCodex( + "folds a mid-turn prompt into the running turn via steering", + async () => { + killCodexStragglers(); + const s = await openSession({ + adapter, + cwd: repo, + codexOptions: codexOptions(), + meta: meta(), + }); + try { + const p1 = s.conn.prompt({ + sessionId: s.sessionId, + prompt: [ + { + type: "text", + text: "Count up from 1, one number per line, and keep going.", + }, + ], + }); + await waitFor( + () => + s.capture.updates("agent_message_chunk").length > 0 + ? true + : undefined, + 20_000, + ); + const p2 = s.conn.prompt({ + sessionId: s.sessionId, + prompt: [{ type: "text", text: "Now stop and say DONE." 
}], + }); + const [r1] = await Promise.all([p1, p2]); + expect(r1.stopReason).toBe("end_turn"); + expect( + s.capture.updates("user_message_chunk").length, + ).toBeGreaterThanOrEqual(2); + // The steer proof: folded into a SINGLE turn (one turn_complete). Two would + // mean the steer didn't take and p2 ran as its own turn. + const turnCompletes = s.capture.events.filter( + (e) => + e.kind === "extNotification" && + e.method === "_posthog/turn_complete", + ).length; + expect( + turnCompletes, + "expected the steered prompt to fold into one running turn (1 " + + "turn_complete); 2 means the steer didn't take", + ).toBe(1); + } finally { + await s.cleanup(); + } + }, + 120_000, + ); + + itCodex( + "lists the session and forks it", + async () => { + killCodexStragglers(); + const b = openConnection({ + adapter, + cwd: repo, + codexOptions: codexOptions(), + }); + try { + await b.conn.initialize(INIT_PARAMS); + const listed = await b.conn.listSessions({ cwd: repo }); + const ids = (listed.sessions ?? []).map((x) => x.sessionId); + expect(ids).toContain(sessionId); + const forked = await b.conn.unstable_forkSession({ + sessionId, + cwd: repo, + mcpServers: [], + _meta: { model: E2E.model(adapter) }, + }); + expect(forked.sessionId).toBeTruthy(); + expect(forked.sessionId).not.toBe(sessionId); + } finally { + await b.cleanup(); + } + }, + 60_000, + ); + + // Known gap: the permission DENY path isn't exercised (neither arm reliably + // surfaces a deny-able approval to a cheap model) — unit-covered in + // approvals.test.ts / codex-app-server-agent.test.ts. 
+ + it("interrupts an in-flight turn", async () => { + if (adapter === "codex") killCodexStragglers(); + const s = await openSession({ + adapter, + cwd: repo, + codexOptions: codexOptions(), + meta: meta(), + }); + try { + const p = s.conn.prompt({ + sessionId: s.sessionId, + prompt: [ + { + type: "text", + text: "Count up from 1, one number per line, and never stop until told to.", + }, + ], + }); + // Cancel as soon as the turn is in flight (unbounded work, so no race). + await waitFor( + () => + s.capture.updates("agent_message_chunk").length > 0 || + s.capture.updates("tool_call").length > 0 + ? true + : undefined, + 20_000, + ); + await s.conn.cancel({ sessionId: s.sessionId }); + const res = await p; + expect(res.stopReason).toBe("cancelled"); + + // After a cancel the session must be usable again — a bounded follow-up must complete. + const followUp = await s.conn.prompt({ + sessionId: s.sessionId, + prompt: [{ type: "text", text: "Stop. Reply with just: OK" }], + }); + expect(followUp.stopReason).toBe("end_turn"); + } finally { + await s.cleanup(); + } + }, 120_000); + + it("resumeSession reconnects and returns config options", async () => { + if (adapter === "codex") killCodexStragglers(); + const b = openConnection({ + adapter, + cwd: repo, + codexOptions: codexOptions(), + }); + try { + await b.conn.initialize(INIT_PARAMS); + const resumed = await b.conn.resumeSession({ + sessionId, + cwd: repo, + mcpServers: [], + _meta: { model: E2E.model(adapter) }, + }); + expect(resumed).toBeTruthy(); + expect(Array.isArray(resumed.configOptions)).toBe(true); + } finally { + await b.cleanup(); + } + }, 60_000); + + it("reattach (loadSession) restores the session and replays the transcript", async () => { + if (adapter === "codex") killCodexStragglers(); + const b = openConnection({ + adapter, + cwd: repo, + codexOptions: codexOptions(), + }); + try { + await b.conn.initialize(INIT_PARAMS); + const loaded = await b.conn.loadSession({ + sessionId, + cwd: repo, + 
// One suite per adapter. A suite is skipped (with the reason appended to its
// title) whenever E2E.skipReason() returns a reason for that adapter.
for (const adapter of ADAPTERS) {
  const skipReason = E2E.skipReason(adapter);
  const skipSuffix = skipReason ? ` — SKIPPED (${skipReason})` : "";
  const suiteTitle = `structured output (${adapter})${skipSuffix}`;

  describe.skipIf(Boolean(skipReason))(suiteTitle, () => {
    let repoDir: string;

    beforeAll(() => {
      // Clear leftover codex processes before configuring the environment.
      if (adapter === "codex") killCodexStragglers();
      E2E.configureEnv(adapter);
      repoDir = setupRepo();
    });

    afterAll(() => {
      cleanupRepo(repoDir);
    });

    it("delivers schema-constrained structured output", async () => {
      let structured: Record<string, unknown> | undefined;
      // The cheapest models hang on the constrained decode; use a stronger one.
      const model = E2E.strongModel(adapter);
      const codexOptions =
        adapter === "codex"
          ? E2E.codexOptions(repoDir, undefined, model)
          : undefined;

      const session = await openSession({
        adapter,
        cwd: repoDir,
        codexOptions,
        // Record the parsed object handed to the structured-output callback.
        onStructuredOutput: async (output) => {
          structured = output;
        },
        meta: {
          systemPrompt: "You answer strictly with JSON matching the schema.",
          model,
          permissionMode: "bypassPermissions",
          jsonSchema: SCHEMA,
          // Prod always sets taskRunId — exercise structured output + the
          // session ext-notification together.
          taskRunId: "e2e-structured",
        },
      });

      try {
        const question =
          "What is the capital of France? Answer using the required JSON schema.";
        const response = await session.conn.prompt({
          sessionId: session.sessionId,
          prompt: [{ type: "text", text: question }],
        });

        expect(response.stopReason).toBe("end_turn");
        expect(structured, "onStructuredOutput should fire").toBeTruthy();
        expect(typeof structured?.capital).toBe("string");
        const capital = (structured?.capital as string).toLowerCase();
        expect(capital).toContain("paris");
        expect(session.capture.extMethods()).toContain("_posthog/sdk_session");
      } finally {
        // Always tear the session down, even when an assertion failed.
        await session.cleanup();
      }
    }, 120_000);
  });
}
/**
 * Keep result shapes comparable: drop big/nondeterministic blobs, keep structure.
 *
 * The redaction is shallow — only the top-level keys of `value` are rewritten:
 * - `sessionId` is blanked;
 * - `configOptions` entries are reduced to id / category / current value / option ids;
 * - `modes` is reduced to the current mode id and the list of available mode ids;
 * - numeric `usage` fields collapse to `">0"` or `0`;
 * - strings longer than 120 characters are blanked.
 *
 * @param value Any step result or ext payload; non-objects are returned as-is.
 * @returns A redacted copy (the input is never mutated).
 */
function redact(value: any): any {
  if (!value || typeof value !== "object") return value;
  // Arrays keep their array shape (each element is redacted on its own) instead
  // of being flattened into an index-keyed object.
  if (Array.isArray(value)) return value.map((item) => redact(item));

  const out: any = {};
  for (const [k, v] of Object.entries(value)) {
    if (k === "sessionId") out[k] = "";
    else if (k === "configOptions" && Array.isArray(v)) {
      out[k] = v.map((o: any) => ({
        id: o?.id,
        category: o?.category,
        // Config options may expose the selection as `currentValue` and each
        // choice as `value`; fall back to those so they are not dropped.
        value: o?.value ?? o?.currentValue,
        options: (o?.options ?? []).map(
          (x: any) => x?.id ?? x?.optionId ?? x?.value,
        ),
      }));
    } else if (k === "modes") {
      out[k] = {
        currentModeId: (v as any)?.currentModeId,
        availableModes: ((v as any)?.availableModes ?? []).map(
          (m: any) => m?.id,
        ),
      };
    } else if (k === "usage" && v && typeof v === "object") {
      // Token counts differ run to run; only "was anything counted" is comparable.
      out[k] = Object.fromEntries(
        Object.entries(v as Record<string, unknown>).map(([uk, uv]) => [
          uk,
          typeof uv === "number" ? (uv > 0 ? ">0" : 0) : uv,
        ]),
      );
    } else if (typeof v === "string" && v.length > 120) out[k] = "";
    else out[k] = v;
  }
  return out;
}

/**
 * Drive one scenario through one codex adapter and capture the ACP stream.
 *
 * Every sessionUpdate, requestPermission, ext-notification and ext-method the
 * adapter sends is appended to `events` in arrival order, and every
 * `ctx.step()` call is recorded in `stepResults`. Failures never throw out of
 * this function: the first error (including the scenario timeout) is stored in
 * `fatalError` and the partial capture is returned.
 *
 * @param mode Which adapter to exercise; selected via POSTHOG_CODEX_USE_ACP.
 * @param scenario The scripted sequence of ACP client operations.
 * @param cfg Working directory, codex options, optional logger and timeout
 *   (`timeoutMs`, default 180000, covers `initialize` plus the scenario).
 * @returns The captured run.
 */
export async function runScenario(
  mode: AdapterMode,
  scenario: Scenario,
  cfg: HarnessConfig,
): Promise<CapturedRun> {
  // Select the adapter. Until the migration adds a passed-in option, the env
  // toggle is the only lever: set => codex-acp, unset => native app-server.
  if (mode === "acp") process.env.POSTHOG_CODEX_USE_ACP = "1";
  else delete process.env.POSTHOG_CODEX_USE_ACP;

  const captured: CapturedRun = {
    adapter: mode,
    scenario: scenario.name,
    events: [],
    stepResults: [],
  };
  // Monotonic ordinal shared by every captured event (not wall-clock time).
  let ord = 0;

  const client = {
    async sessionUpdate(p: any): Promise<void> {
      captured.events.push({
        t: ord++,
        kind: "sessionUpdate",
        sessionUpdate: p?.update?.sessionUpdate,
        data: p?.update,
      });
    },
    async requestPermission(p: any): Promise<any> {
      captured.events.push({
        t: ord++,
        kind: "requestPermission",
        data: {
          title: p?.toolCall?.title,
          kind: p?.toolCall?.kind,
          options: (p?.options ?? []).map((o: any) => ({
            id: o?.optionId,
            kind: o?.kind,
          })),
        },
      });
      // Auto-approve: prefer an explicit allow option, else the first offered.
      const allow =
        (p?.options ?? []).find(
          (o: any) => o?.kind === "allow_once" || o?.kind === "allow_always",
        ) ?? p?.options?.[0];
      return {
        outcome: { outcome: "selected", optionId: allow?.optionId ?? "allow" },
      };
    },
    async readTextFile(p: any): Promise<{ content: string }> {
      return { content: await fs.readFile(resolve(cfg.cwd, p.path), "utf8") };
    },
    async writeTextFile(p: any): Promise<Record<string, never>> {
      await fs.writeFile(resolve(cfg.cwd, p.path), p.content);
      return {};
    },
    // PostHog ext-notifications (_posthog/usage_update, _posthog/turn_complete,
    // _posthog/sdk_session, ...) are part of the parity surface and are sent
    // outside sessionUpdate — capture them so the report covers them.
    async extNotification(method: string, params: any): Promise<void> {
      captured.events.push({
        t: ord++,
        kind: "extNotification",
        op: method,
        data: redact(params),
      });
    },
    async extMethod(method: string, params: any): Promise<any> {
      captured.events.push({
        t: ord++,
        kind: "extMethod",
        op: method,
        data: redact(params),
      });
      return {};
    },
  };

  const acp = createAcpConnection({
    adapter: "codex",
    codexOptions: cfg.codexOptions as any,
    logger: cfg.logger,
  });
  const stream = ndJsonStream(
    acp.clientStreams.writable,
    acp.clientStreams.readable,
  );
  const conn = new ClientSideConnection(() => client, stream);

  const ctx: ScenarioCtx = {
    cwd: cfg.cwd,
    model: cfg.codexOptions.model,
    async step(op, fn) {
      captured.events.push({ t: ord++, kind: "step", op });
      const started = Date.now();
      console.error(` [step] ${op} ...`);
      try {
        const result = await fn();
        console.error(` [step] ${op} ✓ (${Date.now() - started}ms)`);
        captured.stepResults.push({ op, ok: true, result: redact(result) });
        return result;
      } catch (e: any) {
        console.error(
          ` [step] ${op} ✗ (${Date.now() - started}ms): ${String(e?.message ?? e)}`,
        );
        captured.stepResults.push({
          op,
          ok: false,
          error: String(e?.message ?? e),
        });
        throw e;
      }
    },
  };

  const timeoutMs = cfg.timeoutMs ?? 180000;
  let scenarioTimer: ReturnType<typeof setTimeout> | undefined;
  const timeout = new Promise<never>((_, reject) => {
    scenarioTimer = setTimeout(
      () => reject(new Error(`scenario timeout after ${timeoutMs}ms`)),
      timeoutMs,
    );
  });
  // initialize + scenario run as one unit so the timeout bounds both. Racing
  // them together also guarantees the timeout promise always has a handler,
  // even when initialize fails before the scenario starts.
  const work = (async () => {
    await ctx.step("initialize", () =>
      conn.initialize({
        protocolVersion: 1,
        clientCapabilities: { fs: { readTextFile: true, writeTextFile: true } },
      }),
    );
    await scenario.run(conn, ctx);
  })();

  try {
    await Promise.race([work, timeout]);
  } catch (e: any) {
    captured.fatalError = String(e?.message ?? e);
  } finally {
    // Release the scenario timer so it cannot keep the event loop alive.
    clearTimeout(scenarioTimer);
    // Bounded: a wedged adapter cleanup must never hang the loop.
    let cleanupTimer: ReturnType<typeof setTimeout> | undefined;
    try {
      await Promise.race([
        acp.cleanup().catch(() => undefined),
        new Promise<void>((done) => {
          cleanupTimer = setTimeout(done, 5000);
        }),
      ]);
    } finally {
      clearTimeout(cleanupTimer);
    }
  }
  return captured;
}
"http://localhost:3308/posthog_code/v1"; +const API_KEY = process.env.PARITY_API_KEY ?? ""; +const MODEL = process.env.PARITY_MODEL ?? "gpt-5.5"; +const REPO = "/tmp/codex-parity-repo"; + +const SCENARIOS: Scenario[] = [ + { + name: "basic-task", + async run(conn, ctx) { + const session = await ctx.step("newSession", () => + conn.newSession({ + cwd: ctx.cwd, + mcpServers: [], + _meta: { + sessionId: "parity", + systemPrompt: "You are a coding assistant in a tiny test repo.", + model: ctx.model, + permissionMode: "bypassPermissions", + }, + }), + ); + const sessionId = session.sessionId; + await ctx.step("prompt", () => + conn.prompt({ + sessionId, + prompt: [ + { + type: "text", + text: "Do exactly these steps and nothing else: 1) Read the file target.txt. 2) Edit it so the second line reads FOO instead of line2. 3) Run the shell command `cat target.txt`. 4) In one sentence confirm what you changed, then stop.", + }, + ], + }), + ); + }, + }, + { + name: "modes-and-resume", + async run(conn, ctx) { + const session = await ctx.step("newSession", () => + conn.newSession({ + cwd: ctx.cwd, + mcpServers: [], + _meta: { + sessionId: "parity2", + systemPrompt: "You are a coding assistant.", + model: ctx.model, + permissionMode: "auto", + }, + }), + ); + const sessionId = session.sessionId; + // Mode switch — codex-acp supports it; app-server gap until migration. + await ctx.step("setSessionConfigOption(mode)", () => + conn + .setSessionConfigOption({ + sessionId, + configId: "mode", + value: "read-only", + }) + .catch((e: any) => { + throw e; + }), + ); + await ctx.step("prompt", () => + conn.prompt({ + sessionId, + prompt: [ + { + type: "text", + text: "List the files in this repo with `ls`, then stop.", + }, + ], + }), + ); + // Resume in the same connection (host calls resumeSession on reconnect). 
/**
 * Reduce a captured run to a flat, order-insensitive feature summary.
 *
 * Set-like features are returned as sorted arrays, so two summaries can be
 * compared by serialising them.
 *
 * @param run The capture produced for one adapter and one scenario.
 * @returns The feature summary (update types, tool kinds/statuses, usage,
 *   config categories, modes, approvals, ext-notifications, stop reasons, steps).
 */
function extractFeatures(run: CapturedRun): Record<string, any> {
  const seenUpdates = new Set<string>();
  const seenToolKinds = new Set<string>();
  const seenToolStatuses = new Set<string>();
  const seenUsageFields = new Set<string>();
  const seenExtNotifs = new Set<string>();
  const approvalKinds: string[] = [];
  let sawDiff = false;
  let sawToolContent = false;
  let sawModeUpdate = false;

  for (const event of run.events) {
    switch (event.kind) {
      case "requestPermission":
        approvalKinds.push(event.data?.kind ?? "?");
        continue;
      case "extNotification":
        seenExtNotifs.add(event.op ?? "?");
        continue;
      case "sessionUpdate":
        break;
      default:
        continue;
    }

    const updateType = event.sessionUpdate ?? "?";
    const payload = event.data ?? {};
    seenUpdates.add(updateType);

    switch (updateType) {
      case "tool_call":
        if (payload.kind) seenToolKinds.add(payload.kind);
        if (payload.status) seenToolStatuses.add(payload.status);
        break;
      case "tool_call_update": {
        if (payload.status) seenToolStatuses.add(payload.status);
        const parts = payload.content ?? [];
        if (Array.isArray(parts)) {
          if (parts.some((part: any) => part?.type === "diff")) sawDiff = true;
          if (parts.some((part: any) => part?.type === "content"))
            sawToolContent = true;
        }
        // A diff may also surface through the raw tool input or raw output text.
        const rawOutputMentionsDiff =
          typeof payload.rawOutput === "string" &&
          payload.rawOutput.includes("diff");
        if (payload.rawInput?.diff || rawOutputMentionsDiff) sawDiff = true;
        break;
      }
      case "current_mode_update":
      case "config_option_update":
        sawModeUpdate = true;
        break;
      case "usage_update":
        // Field names come from `usage` when present, else from the update itself.
        for (const field of Object.keys(payload.usage ?? payload))
          seenUsageFields.add(field);
        break;
    }
  }

  // newSession response: configOptions / modes
  const newSessionStep = run.stepResults.find((s) => s.op === "newSession");
  const newSessionResult = newSessionStep?.result ?? {};
  const categories = (newSessionResult.configOptions ?? [])
    .map((o: any) => o.category)
    .filter(Boolean);
  const modes = newSessionResult.modes ?? null;

  // prompt response usage / stopReason
  const promptResults = run.stepResults
    .filter((s) => s.op === "prompt")
    .map((s) => s.result ?? {});
  const stopReasons = promptResults.map((r) => r.stopReason).filter(Boolean);
  const promptReportedUsage = promptResults.some(
    (r) => r.usage && Object.keys(r.usage).length > 0,
  );

  const sorted = (values: Iterable<string>) => [...values].sort();

  return {
    fatalError: run.fatalError ?? null,
    updateTypes: sorted(seenUpdates),
    toolKinds: sorted(seenToolKinds),
    toolStatuses: sorted(seenToolStatuses),
    hasDiffContent: sawDiff,
    hasToolContent: sawToolContent,
    hasUsage:
      promptReportedUsage ||
      seenUpdates.has("usage_update") ||
      seenExtNotifs.has("_posthog/usage_update"),
    usageFields: sorted(seenUsageFields),
    configOptionCategories: [...new Set(categories)].sort(),
    modesPresent: !!modes,
    modeChangeEmitted: sawModeUpdate,
    approvalsRequested: approvalKinds.length,
    extNotifications: sorted(seenExtNotifs),
    stopReasons,
    steps: run.stepResults.map((s) => ({ op: s.op, ok: s.ok, error: s.error })),
  };
}
// Features that must be identical between the two adapters; a mismatch on any
// of these is counted as a parity gap.
const ADAPTER_KEYS = [
  "fatalError",
  "updateTypes",
  "hasUsage",
  "usageFields",
  "configOptionCategories",
  "modesPresent",
  "modeChangeEmitted",
  "extNotifications",
  "stopReasons",
];
// Tool-rendering features: compared and reported, but flagged as behavioral so
// they are not counted as parity gaps.
const BEHAVIORAL_KEYS = [
  "toolKinds",
  "toolStatuses",
  "hasDiffContent",
  "hasToolContent",
];

/**
 * Compare two feature summaries key by key.
 *
 * Values are compared by their JSON serialisation. Adapter-level keys come
 * first, behavioral keys after, each in declaration order.
 *
 * @param acp Feature summary from the codex-acp arm.
 * @param app Feature summary from the app-server arm.
 * @returns One row per compared feature, carrying both values, whether they
 *   match, and whether the feature is behavioral.
 */
function diffFeatures(
  acp: Record<string, any>,
  app: Record<string, any>,
): Array<{
  feature: string;
  acp: any;
  appServer: any;
  match: boolean;
  behavioral: boolean;
}> {
  const compare = (keys: string[], behavioral: boolean) =>
    keys.map((feature) => {
      const acpValue = acp[feature];
      const appValue = app[feature];
      return {
        feature,
        acp: acpValue,
        appServer: appValue,
        match: JSON.stringify(acpValue) === JSON.stringify(appValue),
        behavioral,
      };
    });
  return compare(ADAPTER_KEYS, false).concat(compare(BEHAVIORAL_KEYS, true));
}

/**
 * Prepare the scratch git repository at REPO: create the directory if needed,
 * `git init`, write target.txt with its three original lines, stage everything
 * and attempt an initial commit. A failing commit is ignored.
 */
function setupRepo(): void {
  const git = (...args: string[]) => execFileSync("git", args, { cwd: REPO });

  if (!existsSync(REPO)) mkdirSync(REPO, { recursive: true });
  git("init", "-q");
  writeFileSync(join(REPO, "target.txt"), "line1\nline2\nline3\n");
  git("add", "-A");
  try {
    // -c commit.gpgsign=false: ignore the user's global commit-signing config
    // (e.g. 1Password SSH signer), which fails in this non-interactive context.
    git(
      "-c",
      "commit.gpgsign=false",
      "-c",
      "user.email=p@p.dev",
      "-c",
      "user.name=parity",
      "commit",
      "-qm",
      "init",
    );
  } catch {
    /* already committed */
  }
}
args[args.indexOf("--scenario") + 1] + : null; + mkdirSync(OUT_DIR, { recursive: true }); + setupRepo(); + + const modes: AdapterMode[] = []; + if (!only || only === "acp") modes.push("acp"); + if ((!only || only === "app-server") && existsSync(NATIVE_CODEX_BIN)) + modes.push("app-server"); + else if (only === "app-server") + console.warn( + `native codex binary missing at ${NATIVE_CODEX_BIN}; app-server arm skipped`, + ); + + const scenarios = SCENARIOS.filter( + (s) => !scenarioFilter || s.name === scenarioFilter, + ); + const logger = new Logger({ + debug: !!process.env.PARITY_DEBUG, + prefix: "[parity]", + }); + const featuresByMode: Record> = {}; + + for (const scenario of scenarios) { + featuresByMode[scenario.name] = {}; + for (const mode of modes) { + console.log(`\n▶ ${scenario.name} via ${mode} ...`); + // codex spawns detached (own process group); a timed-out run orphans it + // holding a flock under ~/.codex/tmp, which wedges the next run. Kill any + // stragglers first — process death releases the flock. (Uses the default + // CODEX_HOME: an isolated empty home makes codex-acp crash at startup.) + try { + execFileSync("pkill", ["-9", "-f", "resources/codex-acp"], { + stdio: "ignore", + }); + } catch { + /* none running */ + } + const cfg = { + cwd: REPO, + codexOptions: { + cwd: REPO, + binaryPath: CODEX_ACP_BIN, + apiBaseUrl: GATEWAY, + apiKey: API_KEY, + model: MODEL, + }, + timeoutMs: 240000, + logger, + }; + const run = await runScenario(mode, scenario, cfg); + writeFileSync( + join(OUT_DIR, `${scenario.name}.${mode}.json`), + JSON.stringify(run, null, 2), + ); + const feats = extractFeatures(run); + featuresByMode[scenario.name][mode] = feats; + writeFileSync( + join(OUT_DIR, `${scenario.name}.${mode}.features.json`), + JSON.stringify(feats, null, 2), + ); + console.log( + ` steps: ${feats.steps.map((s: any) => `${s.op}${s.ok ? 
"✓" : "✗"}`).join(" ")}`, + ); + console.log( + ` updates: ${feats.updateTypes.join(",")} | tools: ${feats.toolKinds.join(",")} | usage:${feats.hasUsage} diff:${feats.hasDiffContent} stop:${feats.stopReasons.join(",")}`, + ); + if (feats.fatalError) console.log(` ⚠ fatalError: ${feats.fatalError}`); + } + } + + // Diff report (only meaningful when both arms ran) + const report: any = { gateway: GATEWAY, model: MODEL, scenarios: {} }; + let totalGaps = 0; + for (const scenario of scenarios) { + const acp = featuresByMode[scenario.name].acp; + const app = featuresByMode[scenario.name]["app-server"]; + if (acp && app) { + const diff = diffFeatures(acp, app); + const gaps = diff.filter((d) => !d.match && !d.behavioral); + const behavioral = diff.filter((d) => !d.match && d.behavioral); + totalGaps += gaps.length; + report.scenarios[scenario.name] = { + gaps, + behavioral, + allMatch: gaps.length === 0, + }; + console.log(`\n=== parity diff: ${scenario.name} ===`); + if (!gaps.length) console.log(" ✅ adapter parity"); + for (const g of gaps) + console.log( + ` ✗ ${g.feature}: acp=${JSON.stringify(g.acp)} app-server=${JSON.stringify(g.appServer)}`, + ); + for (const b of behavioral) + console.log( + ` · behavioral: ${b.feature} acp=${JSON.stringify(b.acp)} app-server=${JSON.stringify(b.appServer)}`, + ); + } else { + report.scenarios[scenario.name] = { + baselineOnly: acp ? "acp" : "app-server", + features: acp ?? app, + }; + } + } + writeFileSync( + join(OUT_DIR, "parity-report.json"), + JSON.stringify(report, null, 2), + ); + console.log( + `\nWrote ${join(OUT_DIR, "parity-report.json")} — ${totalGaps} parity gap(s).`, + ); + process.exit(totalGaps > 0 ? 
1 : 0); +} + +main().catch((e) => { + console.error("parity runner failed:", e); + process.exit(2); +}); diff --git a/packages/agent/src/adapters/acp-connection.test.ts b/packages/agent/src/adapters/acp-connection.test.ts new file mode 100644 index 0000000000..b1b1d82833 --- /dev/null +++ b/packages/agent/src/adapters/acp-connection.test.ts @@ -0,0 +1,47 @@ +import { afterEach, describe, expect, it } from "vitest"; +import { resolveUseCodexAppServer } from "./acp-connection"; + +describe("resolveUseCodexAppServer", () => { + const saved = { + app: process.env.POSTHOG_CODEX_USE_APP_SERVER, + acp: process.env.POSTHOG_CODEX_USE_ACP, + }; + afterEach(() => { + if (saved.app === undefined) + delete process.env.POSTHOG_CODEX_USE_APP_SERVER; + else process.env.POSTHOG_CODEX_USE_APP_SERVER = saved.app; + if (saved.acp === undefined) delete process.env.POSTHOG_CODEX_USE_ACP; + else process.env.POSTHOG_CODEX_USE_ACP = saved.acp; + }); + + it("host flag wins over env and default", () => { + process.env.POSTHOG_CODEX_USE_ACP = "1"; + process.env.POSTHOG_CODEX_USE_APP_SERVER = "1"; + expect(resolveUseCodexAppServer({ useCodexAppServer: false })).toBe(false); + expect(resolveUseCodexAppServer({ useCodexAppServer: true })).toBe(true); + }); + + it("POSTHOG_CODEX_USE_APP_SERVER=1 forces app-server", () => { + delete process.env.POSTHOG_CODEX_USE_ACP; + process.env.POSTHOG_CODEX_USE_APP_SERVER = "1"; + expect(resolveUseCodexAppServer({})).toBe(true); + }); + + it("POSTHOG_CODEX_USE_ACP=1 forces codex-acp", () => { + delete process.env.POSTHOG_CODEX_USE_APP_SERVER; + process.env.POSTHOG_CODEX_USE_ACP = "1"; + expect(resolveUseCodexAppServer({})).toBe(false); + }); + + it("defaults to codex-acp when nothing is set (app-server is opt-in)", () => { + delete process.env.POSTHOG_CODEX_USE_APP_SERVER; + delete process.env.POSTHOG_CODEX_USE_ACP; + expect(resolveUseCodexAppServer({})).toBe(false); + }); + + it("host flag false beats POSTHOG_CODEX_USE_APP_SERVER=1", () => { + 
process.env.POSTHOG_CODEX_USE_APP_SERVER = "1"; + delete process.env.POSTHOG_CODEX_USE_ACP; + expect(resolveUseCodexAppServer({ useCodexAppServer: false })).toBe(false); + }); +}); diff --git a/packages/agent/src/adapters/acp-connection.ts b/packages/agent/src/adapters/acp-connection.ts index 97251c8b7c..f67dbb1f10 100644 --- a/packages/agent/src/adapters/acp-connection.ts +++ b/packages/agent/src/adapters/acp-connection.ts @@ -27,6 +27,14 @@ export type AcpConnectionConfig = { processCallbacks?: ProcessSpawnedCallback; codexOptions?: CodexProcessOptions; allowedModelIds?: Set; + /** + * Feature-flag lever for the codex sub-adapter, passed by the host from the + * `codex-app-server` PostHog flag (gradual rollout / kill-switch). `true` => + * native app-server, `false` => codex-acp. When undefined, falls back to env + * overrides then the default (codex-acp). Lets app-server roll out alongside + * codex-acp without a code change. + */ + useCodexAppServer?: boolean; /** Callback invoked when the agent calls the create_output tool for structured output */ onStructuredOutput?: (output: Record) => Promise; /** PostHog API config; when set, enables file-read enrichment unless disabled. */ @@ -70,6 +78,24 @@ function resolveEnricherApiConfig( return enabled ? config.posthogApiConfig : undefined; } +/** + * Resolves which codex sub-adapter to use. Precedence: host flag + * (`config.useCodexAppServer`, from the `codex-app-server` PostHog flag) > env + * overrides (`POSTHOG_CODEX_USE_APP_SERVER=1` / `POSTHOG_CODEX_USE_ACP=1`) > + * default (codex-acp, the proven fallback). The native app-server is opt-in: + * the host turns it on per-user via the flag (cloud passes the resolved env; + * desktop passes `useCodexAppServer`), so it can roll out alongside codex-acp + * without a code change and be killed instantly by flipping the flag off. 
+ */ +export function resolveUseCodexAppServer(config: AcpConnectionConfig): boolean { + if (typeof config.useCodexAppServer === "boolean") { + return config.useCodexAppServer; + } + if (process.env.POSTHOG_CODEX_USE_APP_SERVER === "1") return true; + if (process.env.POSTHOG_CODEX_USE_ACP === "1") return false; + return false; +} + function createClaudeConnection(config: AcpConnectionConfig): AcpConnection { const logger = config.logger?.child("AcpConnection") ?? @@ -210,10 +236,18 @@ function createCodexConnection(config: AcpConnectionConfig): AcpConnection { const codexOptions = config.codexOptions ?? {}; const nativeBinary = nativeCodexBinaryPath(codexOptions.binaryPath); - // The native app-server is the default Codex harness. Fall back to the - // codex-acp (Zed) adapter only when the codex binary isn't bundled or when - // POSTHOG_CODEX_USE_ACP is set as an escape hatch. - if (nativeBinary && process.env.POSTHOG_CODEX_USE_ACP !== "1") { + // Use the native app-server when its binary is bundled AND the host (flag) + // / env selects it. See resolveUseCodexAppServer for precedence. + const useAppServer = !!nativeBinary && resolveUseCodexAppServer(config); + logger.info( + `Codex sub-adapter selected: ${useAppServer ? 
"app-server (native codex)" : "codex-acp"}`, + { + useAppServer, + nativeBinaryFound: !!nativeBinary, + hostFlag: config.useCodexAppServer, + }, + ); + if (useAppServer) { agent = new CodexAppServerAgent(client, { processOptions: { binaryPath: nativeBinary, @@ -221,10 +255,12 @@ function createCodexConnection(config: AcpConnectionConfig): AcpConnection { apiBaseUrl: codexOptions.apiBaseUrl, apiKey: codexOptions.apiKey, developerInstructions: codexOptions.developerInstructions, + configOverrides: codexOptions.configOverrides, }, model: codexOptions.model, reasoningEffort: codexOptions.reasoningEffort, processCallbacks: config.processCallbacks, + onStructuredOutput: config.onStructuredOutput, logger: config.logger?.child("CodexAppServerAgent"), }); return agent; diff --git a/packages/agent/src/adapters/codex-app-server/app-server-client.test.ts b/packages/agent/src/adapters/codex-app-server/app-server-client.test.ts index db734950b0..cc688e061d 100644 --- a/packages/agent/src/adapters/codex-app-server/app-server-client.test.ts +++ b/packages/agent/src/adapters/codex-app-server/app-server-client.test.ts @@ -7,17 +7,14 @@ import { import { AppServerClient } from "./app-server-client"; interface RpcMessage { - id?: number; + id?: number | string; method?: string; params?: unknown; result?: unknown; error?: { code: number; message: string }; } -/** - * Drives the "server" end of a {@link StreamPair}: reads newline-delimited - * JSON-RPC the client sent and writes framed responses/notifications back. - */ +/** Drives the "server" end of a {@link StreamPair}: reads client JSON-RPC and writes framed replies back. 
*/ function makeFakeServer(transport: StreamPair) { const writer = transport.writable.getWriter(); const reader = transport.readable.getReader(); @@ -142,6 +139,28 @@ describe("AppServerClient", () => { await client.close(); }); + it("answers a server request with a STRING id (RequestId is string|number)", async () => { + const streams = createBidirectionalStreams(); + const onRequest = vi.fn(async () => ({ decision: "approved" })); + const client = new AppServerClient(streams.client, { + logger: silentLogger, + onRequest, + }); + const server = makeFakeServer(streams.agent); + + await server.send({ + id: "req-abc", + method: "item/commandExecution/requestApproval", + params: {}, + }); + + const response = await server.readMessage(); + expect(onRequest).toHaveBeenCalledTimes(1); + expect(response.id).toBe("req-abc"); + expect(response.result).toEqual({ decision: "approved" }); + await client.close(); + }); + it("rejects in-flight requests when closed", async () => { const streams = createBidirectionalStreams(); const client = new AppServerClient(streams.client, { diff --git a/packages/agent/src/adapters/codex-app-server/app-server-client.ts b/packages/agent/src/adapters/codex-app-server/app-server-client.ts index 1fc5564ced..c437155be3 100644 --- a/packages/agent/src/adapters/codex-app-server/app-server-client.ts +++ b/packages/agent/src/adapters/codex-app-server/app-server-client.ts @@ -1,14 +1,11 @@ import { Logger } from "../../utils/logger"; import type { StreamPair } from "../../utils/streams"; -import type { JsonRpcMessage, JsonRpcResponse } from "./protocol"; +import type { JsonRpcMessage, JsonRpcResponse, RequestId } from "./protocol"; export interface AppServerClientHandlers { /** Server-pushed notification (no id), e.g. `item/agentMessage/delta`. */ onNotification?: (method: string, params: unknown) => void; - /** - * Server-initiated request (has an id), e.g. an approval. The resolved value - * is returned to the server as the JSON-RPC result. 
- */ + /** Server-initiated request (has an id), e.g. an approval; the resolved value is returned as the JSON-RPC result. */ onRequest?: (method: string, params: unknown) => Promise; /** Fired once when the stream ends without an explicit close() (process exit). */ onClose?: () => void; @@ -28,17 +25,13 @@ export interface AppServerRpc { } /** - * Bidirectional newline-delimited JSON-RPC client for the native Codex - * `app-server` subprocess. Unlike the codex-acp adapter this speaks Codex's - * own protocol rather than ACP, so it cannot reuse the ACP SDK connection. - * - * Transport-agnostic: it is given a {@link StreamPair} so tests can drive it - * over in-memory streams without spawning a process. + * Bidirectional newline-delimited JSON-RPC client for the native Codex `app-server` subprocess. + * Transport-agnostic via a {@link StreamPair} so tests can drive it over in-memory streams. */ export class AppServerClient implements AppServerRpc { private readonly writer: WritableStreamDefaultWriter; private readonly encoder = new TextEncoder(); - private readonly pending = new Map(); + private readonly pending = new Map(); private readonly handlers: AppServerClientHandlers; private readonly logger: Logger; private reader?: ReadableStreamDefaultReader; @@ -126,9 +119,7 @@ export class AppServerClient implements AppServerRpc { // lock already released by cancel() } if (!this.closed) { - // The stream ended without an explicit close() (the process exited). - // Fail in-flight calls and notify the owner so a pending turn does not - // hang forever. + // Stream ended without close() (process exited): fail in-flight calls so the turn doesn't hang. 
this.closed = true; for (const call of this.pending.values()) { call.reject(new Error("codex app-server stream closed")); @@ -151,20 +142,22 @@ export class AppServerClient implements AppServerRpc { const id = (message as { id?: unknown }).id; const method = (message as { method?: unknown }).method; const params = (message as { params?: unknown }).params; - - if (typeof method !== "string") { - if (typeof id === "number") { - this.handleResponse(message as JsonRpcResponse); + // Discriminate on id presence, not `typeof id === "number"` — RequestId is + // string|number, so a string-id server request must still be answered. + const hasId = id !== undefined && id !== null; + + if (typeof method === "string") { + if (hasId) { + void this.handleIncomingRequest(id as RequestId, method, params); + } else { + this.handlers.onNotification?.(method, params); } return; } - if (typeof id === "number") { - void this.handleIncomingRequest(id, method, params); - return; + if (hasId) { + this.handleResponse(message as JsonRpcResponse); } - - this.handlers.onNotification?.(method, params); } private handleResponse(message: JsonRpcResponse): void { @@ -182,7 +175,7 @@ export class AppServerClient implements AppServerRpc { } private async handleIncomingRequest( - id: number, + id: RequestId, method: string, params: unknown, ): Promise { diff --git a/packages/agent/src/adapters/codex-app-server/approvals.test.ts b/packages/agent/src/adapters/codex-app-server/approvals.test.ts new file mode 100644 index 0000000000..f1ea74b941 --- /dev/null +++ b/packages/agent/src/adapters/codex-app-server/approvals.test.ts @@ -0,0 +1,322 @@ +import type { + RequestPermissionRequest, + RequestPermissionResponse, +} from "@agentclientprotocol/sdk"; +import { describe, expect, it, vi } from "vitest"; +import { QuestionMetaSchema } from "../claude/questions/utils"; +import { handleServerRequest } from "./approvals"; +import { APP_SERVER_REQUESTS } from "./protocol"; + +// A fake ACP client whose 
requestPermission returns queued outcomes positionally. +function fakeClient(outcomes: RequestPermissionResponse["outcome"][]) { + const calls: RequestPermissionRequest[] = []; + let next = 0; + const requestPermission = vi.fn( + async ( + params: RequestPermissionRequest, + ): Promise => { + calls.push(params); + const outcome = outcomes[next++] ?? { outcome: "cancelled" as const }; + return { outcome }; + }, + ); + return { client: { requestPermission }, calls }; +} + +const opts = { sessionId: "sess-1" }; + +describe("handleServerRequest", () => { + it("maps a requestUserInput question's selected option back to an answer", async () => { + const { client, calls } = fakeClient([ + { outcome: "selected", optionId: "option_1" }, + ]); + + const params = { + threadId: "t", + turnId: "turn", + itemId: "item-9", + autoResolutionMs: null, + questions: [ + { + id: "q1", + header: "Pick one", + question: "Which environment?", + isOther: false, + isSecret: false, + options: [ + { label: "staging", description: "" }, + { label: "production", description: "danger" }, + ], + }, + ], + }; + + const result = await handleServerRequest( + APP_SERVER_REQUESTS.TOOL_USER_INPUT, + params, + client, + opts, + ); + + expect(result.handled).toBe(true); + expect(result.response).toEqual({ + answers: { q1: { answers: ["production"] } }, + }); + + expect(calls).toHaveLength(1); + expect(calls[0].sessionId).toBe("sess-1"); + expect(calls[0].options.map((o) => o.name)).toEqual([ + "staging", + "production", + ]); + }); + + it("carries a QuestionMetaSchema-valid questions array so the host card renders", async () => { + const { client, calls } = fakeClient([ + { outcome: "selected", optionId: "option_0" }, + ]); + + const params = { + threadId: "t", + turnId: "turn", + itemId: "item-1", + autoResolutionMs: null, + questions: [ + { + id: "q1", + header: "Environment", + question: "Which environment?", + isOther: false, + isSecret: false, + options: [ + { label: "staging", description: "" }, + 
{ label: "production", description: "danger" }, + ], + }, + ], + }; + + await handleServerRequest( + APP_SERVER_REQUESTS.TOOL_USER_INPUT, + params, + client, + opts, + ); + + // A bare `{ header }` _meta fails QuestionMetaSchema, rendering an empty card. + const parsed = QuestionMetaSchema.safeParse(calls[0].toolCall?._meta); + expect(parsed.success).toBe(true); + expect(parsed.data?.questions).toEqual([ + { + question: "Which environment?", + header: "Environment", + // The non-empty description rides along; the empty one is dropped. + options: [ + { label: "staging" }, + { label: "production", description: "danger" }, + ], + }, + ]); + }); + + it("defaults a cancelled question to an empty answer", async () => { + const { client } = fakeClient([{ outcome: "cancelled" }]); + + const params = { + threadId: "t", + turnId: "turn", + itemId: "item-1", + autoResolutionMs: null, + questions: [ + { + id: "q1", + header: "h", + question: "q?", + isOther: false, + isSecret: false, + options: [{ label: "a", description: "" }], + }, + ], + }; + + const result = await handleServerRequest( + APP_SERVER_REQUESTS.TOOL_USER_INPUT, + params, + client, + opts, + ); + + expect(result.response).toEqual({ answers: { q1: { answers: [] } } }); + }); + + it.each([ + // "allow_once" grants for the turn, not session-wide; reject grants nothing. 
+ { optionId: "allow", expected: { network: { enabled: true } } }, + { optionId: "reject", expected: {} }, + ])( + "resolves a permission approval on $optionId", + async ({ optionId, expected }) => { + const { client } = fakeClient([{ outcome: "selected", optionId }]); + + const params = { + threadId: "t", + turnId: "turn", + itemId: "perm-1", + environmentId: null, + startedAtMs: 0, + cwd: "/repo", + reason: "needs network", + permissions: { + network: { enabled: true }, + fileSystem: null, + }, + }; + + const result = await handleServerRequest( + APP_SERVER_REQUESTS.PERMISSIONS_APPROVAL, + params, + client, + opts, + ); + + expect(result.handled).toBe(true); + expect(result.response).toEqual({ + permissions: expected, + scope: "turn", + }); + }, + ); + + it("fails closed to the safe default when a payload is malformed", async () => { + const { client } = fakeClient([{ outcome: "selected", optionId: "allow" }]); + const result = await handleServerRequest( + APP_SERVER_REQUESTS.PERMISSIONS_APPROVAL, + null, + client, + opts, + ); + expect(result).toEqual({ + handled: true, + response: { permissions: {}, scope: "turn" }, + }); + }); + + it.each([ + { optionId: "accept", action: "accept", content: {} }, + { optionId: "decline", action: "decline", content: null }, + ])( + "resolves an elicitation on $optionId", + async ({ optionId, action, content }) => { + const { client } = fakeClient([{ outcome: "selected", optionId }]); + + const result = await handleServerRequest( + APP_SERVER_REQUESTS.MCP_ELICITATION, + { + threadId: "t", + turnId: "turn", + serverName: "posthog", + mode: "form", + message: "Confirm the export", + }, + client, + opts, + ); + + expect(result.handled).toBe(true); + expect(result.response).toEqual({ action, content, _meta: null }); + }, + ); + + it("enriches an elicitation with the in-flight MCP tool call so the host renders the real tool", async () => { + const { client, calls } = fakeClient([ + { outcome: "selected", optionId: "accept" }, + ]); + 
+ await handleServerRequest( + APP_SERVER_REQUESTS.MCP_ELICITATION, + { + threadId: "t", + turnId: "turn", + serverName: "posthog", + mode: "form", + message: 'Allow the posthog MCP server to run tool "exec"?', + }, + client, + { + ...opts, + resolveMcpToolCall: (serverName) => + serverName === "posthog" + ? { + server: "posthog", + tool: "exec", + args: { command: "search project|insight" }, + } + : undefined, + }, + ); + + expect(calls[0].toolCall).toMatchObject({ + toolCallId: "posthog:elicitation", + rawInput: { command: "search project|insight" }, + _meta: { + posthog: { + toolName: "mcp__posthog__exec", + mcp: { server: "posthog", tool: "exec" }, + }, + }, + }); + }); + + it("falls back to codex's generic elicitation text when no MCP call correlates", async () => { + const { client, calls } = fakeClient([ + { outcome: "selected", optionId: "decline" }, + ]); + + await handleServerRequest( + APP_SERVER_REQUESTS.MCP_ELICITATION, + { + threadId: "t", + turnId: "t", + serverName: "posthog", + mode: "form", + message: "Confirm", + }, + client, + // resolveMcpToolCall absent (e.g. server mismatch) → no enrichment. 
+ opts, + ); + + expect(calls[0].toolCall).not.toHaveProperty("_meta"); + expect(calls[0].toolCall).toMatchObject({ + toolCallId: "posthog:elicitation", + title: "Confirm", + }); + }); + + it("returns handled:false for the simple command approval (caller owns it)", async () => { + const { client, calls } = fakeClient([]); + + const result = await handleServerRequest( + APP_SERVER_REQUESTS.COMMAND_APPROVAL, + { itemId: "x", command: "ls" }, + client, + opts, + ); + + expect(result).toEqual({ handled: false, response: undefined }); + expect(calls).toHaveLength(0); + }); + + it("returns handled:false for an unknown method", async () => { + const { client } = fakeClient([]); + + const result = await handleServerRequest( + "some/unknown/method", + {}, + client, + opts, + ); + + expect(result).toEqual({ handled: false, response: undefined }); + }); +}); diff --git a/packages/agent/src/adapters/codex-app-server/approvals.ts b/packages/agent/src/adapters/codex-app-server/approvals.ts new file mode 100644 index 0000000000..950e978307 --- /dev/null +++ b/packages/agent/src/adapters/codex-app-server/approvals.ts @@ -0,0 +1,392 @@ +/** + * Handlers for the richer Codex app-server server-requests that carry a typed + * response object rather than a yes/no decision string (requestUserInput, + * permissions/requestApproval, mcpServer/elicitation). Each is surfaced through + * ACP `requestPermission`; on cancel/error we default to the safe outcome so a + * dropped prompt never silently grants access. + */ + +import type { + AgentSideConnection, + PermissionOption, + RequestPermissionResponse, +} from "@agentclientprotocol/sdk"; +import { mcpToolKey, posthogToolMeta } from "@posthog/shared"; +import type { Logger } from "../../utils/logger"; +import { OPTION_PREFIX } from "../claude/questions/utils"; +import { APP_SERVER_REQUESTS } from "./protocol"; + +// Native app-server shapes, re-declared locally so this module doesn't depend on +// the generated schema at build time. 
+ +interface ToolRequestUserInputOption { + label: string; + description: string; +} + +interface ToolRequestUserInputQuestion { + id: string; + header: string; + question: string; + isOther: boolean; + isSecret: boolean; + options: ToolRequestUserInputOption[] | null; +} + +interface ToolRequestUserInputParams { + threadId: string; + turnId: string; + itemId: string; + questions: ToolRequestUserInputQuestion[]; + autoResolutionMs: number | null; +} + +interface ToolRequestUserInputResponse { + answers: { [questionId: string]: { answers: string[] } }; +} + +interface AdditionalNetworkPermissions { + enabled: boolean | null; +} + +interface AdditionalFileSystemPermissions { + read: string[] | null; + write: string[] | null; + globScanMaxDepth?: number; + entries?: unknown[]; +} + +interface RequestPermissionProfile { + network: AdditionalNetworkPermissions | null; + fileSystem: AdditionalFileSystemPermissions | null; +} + +interface PermissionsRequestApprovalParams { + threadId: string; + turnId: string; + itemId: string; + environmentId: string | null; + startedAtMs: number; + cwd: string; + reason: string | null; + permissions: RequestPermissionProfile; +} + +interface GrantedPermissionProfile { + network?: AdditionalNetworkPermissions; + fileSystem?: AdditionalFileSystemPermissions; +} + +type PermissionGrantScope = "turn" | "session"; + +interface PermissionsRequestApprovalResponse { + permissions: GrantedPermissionProfile; + scope: PermissionGrantScope; +} + +type McpServerElicitationAction = "accept" | "decline" | "cancel"; + +interface McpServerElicitationRequestParams { + threadId: string; + turnId: string | null; + serverName: string; + mode: "form" | "url"; + message: string; + // Only `message` is needed to render the prompt; the rest stays untyped. 
+ [key: string]: unknown; +} + +interface McpServerElicitationRequestResponse { + action: McpServerElicitationAction; + content: unknown | null; + _meta?: unknown | null; +} + +export interface HandleServerRequestResult { + // false → not a richer request; the caller handles it (simple approvals). + handled: boolean; + response: unknown; +} + +export interface HandleServerRequestOptions { + sessionId: string; + logger?: Logger; + /** + * Resolve the in-flight MCP tool call for an elicitation's `serverName`. codex's + * elicitation carries no tool/args, so supplying the originating `mcpToolCall` + * lets the prompt render the real operation. Undefined → codex's generic text. + */ + resolveMcpToolCall?: ( + serverName: string, + ) => { server: string; tool: string; args: unknown } | undefined; +} + +/** + * Routes a server-initiated request to the matching richer-response handler. + * Returns `{ handled: false }` for anything this module doesn't own. + */ +export async function handleServerRequest( + method: string, + params: unknown, + client: Pick, + opts: HandleServerRequestOptions, +): Promise { + try { + switch (method) { + case APP_SERVER_REQUESTS.TOOL_USER_INPUT: + return { + handled: true, + response: await handleToolUserInput( + params as ToolRequestUserInputParams, + client, + opts, + ), + }; + case APP_SERVER_REQUESTS.PERMISSIONS_APPROVAL: + return { + handled: true, + response: await handlePermissionsApproval( + params as PermissionsRequestApprovalParams, + client, + opts, + ), + }; + case APP_SERVER_REQUESTS.MCP_ELICITATION: + return { + handled: true, + response: await handleMcpElicitation( + params as McpServerElicitationRequestParams, + client, + opts, + ), + }; + default: + return { handled: false, response: undefined }; + } + } catch (err) { + // Malformed payload fails closed to the safe default — never throw, never grant. 
+ opts.logger?.warn("server-request handler threw; failing closed", { + method, + error: String(err), + }); + return { handled: true, response: safeDefaultFor(method) }; + } +} + +function safeDefaultFor(method: string): unknown { + if (method === APP_SERVER_REQUESTS.PERMISSIONS_APPROVAL) { + return { permissions: {}, scope: "turn" }; + } + if (method === APP_SERVER_REQUESTS.MCP_ELICITATION) { + return { action: "decline", content: null, _meta: null }; + } + return { answers: {} }; +} + +function buildQuestionOptions( + question: ToolRequestUserInputQuestion, +): PermissionOption[] { + return (question.options ?? []).map((opt, idx) => ({ + kind: "allow_once" as const, + name: opt.label, + optionId: `${OPTION_PREFIX}${idx}`, + _meta: opt.description ? { description: opt.description } : undefined, + })); +} + +// Maps a selected optionId (`option_`) back to the chosen option's label. +function answerFromSelection( + question: ToolRequestUserInputQuestion, + optionId: string | undefined, +): string[] { + if (!optionId || !optionId.startsWith(OPTION_PREFIX)) { + return []; + } + const idx = Number(optionId.slice(OPTION_PREFIX.length)); + const opt = question.options?.[idx]; + return opt ? [opt.label] : []; +} + +async function handleToolUserInput( + params: ToolRequestUserInputParams, + client: Pick, + opts: HandleServerRequestOptions, +): Promise { + const answers: ToolRequestUserInputResponse["answers"] = {}; + + for (const question of params.questions ?? []) { + // Default to "no answer" so cancel/failure leaves a well-formed empty response. + answers[question.id] = { answers: [] }; + + const options = buildQuestionOptions(question); + // Free-text questions have no options; requestPermission can't collect them. 
+ if (options.length === 0) { + continue; + } + + let response: RequestPermissionResponse; + try { + response = await client.requestPermission({ + sessionId: opts.sessionId, + options, + toolCall: { + toolCallId: `${params.itemId}:${question.id}`, + title: question.question, + kind: "other", + // The host's QuestionPermission renders from `_meta.questions`; a bare + // `header` renders empty. codex prompts one question per request. + _meta: { + codeToolKind: "question", + questions: [ + { + question: question.question, + header: question.header, + options: (question.options ?? []).map((opt) => ({ + label: opt.label, + ...(opt.description?.trim() + ? { description: opt.description } + : {}), + })), + }, + ], + }, + }, + }); + } catch (err) { + opts.logger?.warn("requestUserInput prompt failed; leaving empty", { + questionId: question.id, + error: String(err), + }); + continue; + } + + if (response.outcome.outcome !== "selected") { + continue; + } + answers[question.id] = { + answers: answerFromSelection(question, response.outcome.optionId), + }; + } + + return { answers }; +} + +async function handlePermissionsApproval( + params: PermissionsRequestApprovalParams, + client: Pick, + opts: HandleServerRequestOptions, +): Promise { + const denied: PermissionsRequestApprovalResponse = { + permissions: {}, + scope: "turn", + }; + + let response: RequestPermissionResponse; + try { + response = await client.requestPermission({ + sessionId: opts.sessionId, + options: [ + { kind: "allow_once", name: "Allow", optionId: "allow" }, + { kind: "reject_once", name: "Reject", optionId: "reject" }, + ], + toolCall: { + toolCallId: params.itemId, + title: params.reason ?? 
/**
 * Answers an MCP server elicitation request by asking the host for permission.
 *
 * The host is shown a two-option prompt (Accept / Decline) through
 * `client.requestPermission`. The outcome maps to the elicitation response as:
 *   - option `accept` selected → `{ action: "accept", content: {} }`
 *   - prompt cancelled         → `{ action: "cancel" }`
 *   - anything else            → `{ action: "decline" }`
 *   - prompt threw             → `{ action: "decline" }`, after a logged warning
 *
 * NOTE(review): the `Pick<AgentSideConnection, "requestPermission">` type
 * arguments are reconstructed from usage (only `requestPermission` is called
 * on `client`) — confirm against the sibling handlers' signatures.
 *
 * @param params - The elicitation request; `serverName` and `message` are read.
 * @param client - Host connection; only `requestPermission` is used.
 * @param opts - Session id, optional logger and optional MCP tool-call resolver.
 * @returns The elicitation response to send back to the requesting server.
 */
async function handleMcpElicitation(
  params: McpServerElicitationRequestParams,
  client: Pick<AgentSideConnection, "requestPermission">,
  opts: HandleServerRequestOptions,
): Promise<McpServerElicitationRequestResponse> {
  const declined: McpServerElicitationRequestResponse = {
    action: "decline",
    content: null,
    _meta: null,
  };

  // Fields shared by both prompt shapes. `||` (not `??`) is deliberate: an
  // empty message must also fall back to the generic title.
  const baseToolCall = {
    toolCallId: `${params.serverName}:elicitation`,
    title: params.message || `${params.serverName} requests input`,
    kind: "other" as const,
  };

  // If the elicitation gates a known in-flight MCP call, carry its real tool +
  // args + `_meta.posthog` so the host renders the proper MCP permission.
  const mcp = opts.resolveMcpToolCall?.(params.serverName);
  const toolCall = mcp
    ? {
        ...baseToolCall,
        rawInput: mcp.args,
        _meta: posthogToolMeta({
          toolName: mcpToolKey({ server: mcp.server, tool: mcp.tool }),
          mcp: { server: mcp.server, tool: mcp.tool },
        }),
      }
    : baseToolCall;

  let response: RequestPermissionResponse;
  try {
    response = await client.requestPermission({
      sessionId: opts.sessionId,
      options: [
        { kind: "allow_once", name: "Accept", optionId: "accept" },
        { kind: "reject_once", name: "Decline", optionId: "decline" },
      ],
      toolCall,
    });
  } catch (err) {
    // A failed prompt must never be treated as consent.
    opts.logger?.warn("elicitation prompt failed; declining", {
      serverName: params.serverName,
      error: String(err),
    });
    return declined;
  }

  if (response.outcome.outcome === "cancelled") {
    return { action: "cancel", content: null, _meta: null };
  }
  if (
    response.outcome.outcome === "selected" &&
    response.outcome.optionId === "accept"
  ) {
    // No structured form UI over requestPermission; accept with empty content.
    return { action: "accept", content: {}, _meta: null };
  }
  return declined;
}
nativeCodexBinaryPath(undefined); + expect(got).toContain("@openai/codex-plat"); + expect(got).toContain("/vendor/"); + expect(got?.endsWith("/bin/codex")).toBe(true); + }); + + it("returns undefined when neither the sibling nor the @openai/codex dep is present", () => { + existsSyncMock.mockReturnValue(false); + resolveMock.mockImplementation(() => { + throw new Error("Cannot find module '@openai/codex-plat/package.json'"); + }); + expect( + nativeCodexBinaryPath("/bundle/codex-acp/codex-acp"), + ).toBeUndefined(); + expect(nativeCodexBinaryPath(undefined)).toBeUndefined(); + }); }); diff --git a/packages/agent/src/adapters/codex-app-server/binary-path.ts b/packages/agent/src/adapters/codex-app-server/binary-path.ts index c025522cd2..6af43597cb 100644 --- a/packages/agent/src/adapters/codex-app-server/binary-path.ts +++ b/packages/agent/src/adapters/codex-app-server/binary-path.ts @@ -1,17 +1,82 @@ import { existsSync } from "node:fs"; +import { createRequire } from "node:module"; import { dirname, join } from "node:path"; /** - * The native codex CLI is bundled next to codex-acp, so derive its path from - * the codex-acp binary path (same directory, `codex` instead of `codex-acp`). - * Returns undefined when the binary isn't present (e.g. the npx fallback), in - * which case the caller keeps using the codex-acp adapter. + * Node `platform-arch` → codex target triple + `@openai/codex` platform sub-package + * that vendors the native binary. Mirrors `@openai/codex`'s own `bin/codex.js` shim. 
/**
 * Node `platform-arch` → codex target triple + `@openai/codex` platform sub-package
 * that vendors the native binary. Mirrors `@openai/codex`'s own `bin/codex.js` shim.
 * A platform missing from this table has no vendored binary to fall back to.
 */
const CODEX_NATIVE_TARGETS: Record<
  string,
  { triple: string; pkg: string } | undefined
> = {
  "linux-x64": {
    triple: "x86_64-unknown-linux-musl",
    pkg: "@openai/codex-linux-x64",
  },
  "linux-arm64": {
    triple: "aarch64-unknown-linux-musl",
    pkg: "@openai/codex-linux-arm64",
  },
  "darwin-x64": {
    triple: "x86_64-apple-darwin",
    pkg: "@openai/codex-darwin-x64",
  },
  "darwin-arm64": {
    triple: "aarch64-apple-darwin",
    pkg: "@openai/codex-darwin-arm64",
  },
  "win32-x64": {
    triple: "x86_64-pc-windows-msvc",
    pkg: "@openai/codex-win32-x64",
  },
  "win32-arm64": {
    triple: "aarch64-pc-windows-msvc",
    pkg: "@openai/codex-win32-arm64",
  },
};

/** File name of the native codex executable on the current platform. */
function codexBinaryName(): string {
  return process.platform === "win32" ? "codex.exe" : "codex";
}

/**
 * Anchor handed to `createRequire` so package resolution starts at this module.
 *
 * Prefers `import.meta.dirname`, then a CommonJS `__dirname` when one exists,
 * and finally the module's own file URL. `__dirname` is probed with `typeof`
 * because a bare reference throws a ReferenceError inside an ES module.
 */
function codexResolutionAnchor(): string {
  const moduleDir =
    import.meta.dirname ??
    (typeof __dirname === "string" ? __dirname : undefined);
  // The file named here need not exist; only its directory is used.
  return moduleDir === undefined
    ? import.meta.url
    : join(moduleDir, "_resolve.js");
}

/**
 * Resolve the native codex binary vendored by `@openai/codex`'s platform sub-package,
 * so the adapter works from a plain `npm install @posthog/agent` with no download.
 *
 * @returns Absolute path to the vendored binary, or undefined when this
 *   platform is not in the target table, the sub-package cannot be resolved,
 *   or the binary file is missing from it.
 */
function vendoredCodexBinary(): string | undefined {
  const target = CODEX_NATIVE_TARGETS[`${process.platform}-${process.arch}`];
  if (!target) return undefined;
  try {
    const requireFrom = createRequire(codexResolutionAnchor());
    const pkgJson = requireFrom.resolve(`${target.pkg}/package.json`);
    const binary = join(
      dirname(pkgJson),
      "vendor",
      target.triple,
      "bin",
      codexBinaryName(),
    );
    return existsSync(binary) ? binary : undefined;
  } catch {
    // Resolution failure means the optional dependency is not installed.
    return undefined;
  }
}

/**
 * Path to the native codex CLI (the one that exposes `app-server`), or undefined
 * when unavailable. Two sources in order: bundled next to codex-acp, then vendored
 * by the `@openai/codex` npm dependency.
 *
 * @param codexAcpPath - Path to the codex-acp binary; its directory is checked
 *   for a sibling `codex` executable. When omitted, only the vendored lookup runs.
 * @returns The first existing binary path, or undefined when neither source has one.
 */
export function nativeCodexBinaryPath(
  codexAcpPath?: string,
): string | undefined {
  if (codexAcpPath) {
    const sibling = join(dirname(codexAcpPath), codexBinaryName());
    if (existsSync(sibling)) return sibling;
  }
  return vendoredCodexBinary();
}
+ const missing = requiredFieldMissing(method, params); + if (missing) { + throw { + code: -32600, + message: `Invalid request: missing field \`${missing}\``, + }; + } return (responses[method] ?? {}) as T; }, notify() {}, @@ -47,13 +71,17 @@ function makeFakeClient( outcome: unknown = { outcome: "selected", optionId: "allow" }, ) { const sessionUpdates: unknown[] = []; + const extNotifications: Array<{ method: string; params: unknown }> = []; const client = { sessionUpdate: async (notification: unknown) => { sessionUpdates.push(notification); }, requestPermission: async () => ({ outcome }), + extNotification: async (method: string, params: unknown) => { + extNotifications.push({ method, params }); + }, } as unknown as AgentSideConnection; - return { client, sessionUpdates }; + return { client, sessionUpdates, extNotifications }; } const init = { protocolVersion: 1 } as unknown as InitializeRequest; @@ -83,7 +111,7 @@ describe("CodexAppServerAgent", () => { prompt: [{ type: "text", text: "hello" }], } as unknown as PromptRequest); - stub.emit("item/agentMessage/delta", { itemId: "i1", text: "Hi there" }); + stub.emit("item/agentMessage/delta", { itemId: "i1", delta: "Hi there" }); stub.emit("turn/completed", { turn: { id: "turn_1", status: "completed" }, }); @@ -105,160 +133,1779 @@ describe("CodexAppServerAgent", () => { }); }); - it("maps a failed turn to a refusal stop reason", async () => { - const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); - const { client } = makeFakeClient(); + it("enriches an MCP tool-call approval with the structured posthog channel", async () => { + const stub = makeStubRpc({ + initialize: {}, + "thread/start": { thread: { id: "thr_1" } }, + }); + const permissionToolCalls: unknown[] = []; + const client = { + sessionUpdate: async () => {}, + requestPermission: async (params: { toolCall: unknown }) => { + permissionToolCalls.push(params.toolCall); + return { outcome: { outcome: "selected", optionId: "allow" } }; + }, 
+ extNotification: async () => {}, + } as unknown as AgentSideConnection; + const agent = new CodexAppServerAgent(client, { - processOptions: { binaryPath: "/x/codex" }, + processOptions: { binaryPath: "/bundle/codex" }, + model: "gpt-5.5", rpcFactory: stub.factory, }); + await agent.initialize(init); + await agent.newSession({ cwd: "/repo" } as unknown as NewSessionRequest); - await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); - const done = agent.prompt({ - sessionId: "t", - prompt: [], - } as unknown as PromptRequest); - stub.emit("turn/completed", { turn: { status: "failed" } }); + // The MCP tool call item arrives first, then codex approves it via a command-execution request. + stub.emit("item/started", { + item: { + type: "mcpToolCall", + id: "m1", + server: "posthog", + tool: "exec", + arguments: { command: "call execute-sql {}" }, + }, + }); + const decision = await stub.invokeRequest( + "item/commandExecution/requestApproval", + { + itemId: "m1", + command: 'Allow the posthog MCP server to run tool "exec"?', + }, + ); - expect((await done).stopReason).toBe("refusal"); + expect(decision).toEqual({ decision: "accept" }); + expect(permissionToolCalls).toHaveLength(1); + expect(permissionToolCalls[0]).toMatchObject({ + toolCallId: "m1", + kind: "other", + rawInput: { command: "call execute-sql {}" }, + _meta: { + posthog: { + toolName: "mcp__posthog__exec", + mcp: { server: "posthog", tool: "exec" }, + }, + }, + }); }); - it("routes command approvals to the host and maps allow to accept", async () => { - const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); - const { client } = makeFakeClient(); + it("enriches the MCP elicitation approval (posthog exec) from the in-flight tool call", async () => { + // codex gates PostHog `exec` behind a generic elicitation (serverName only, no tool/args); + // the adapter correlates it to the in-flight mcpToolCall so the real tool + command render. 
/**
 * Builds an agent wired to a stub RPC and a fake host client that answers
 * every permission prompt by selecting `chooseOptionId`.
 *
 * The fake client records each prompt's `toolCall` and `options` so a test can
 * assert on what the host would have been asked to render.
 *
 * @param chooseOptionId - Option id the fake host selects; defaults to "allow".
 * @returns The agent, the stub RPC, and the recorded tool calls and option lists.
 */
function makeApprovalAgent(chooseOptionId = "allow") {
  const stub = makeStubRpc({
    initialize: {},
    "thread/start": { thread: { id: "thr_1" } },
  });
  const permissionToolCalls: Array<Record<string, unknown>> = [];
  const permissionOptions: Array<
    Array<{ optionId?: string; kind?: string }>
  > = [];
  const client = {
    sessionUpdate: async () => {},
    requestPermission: async (params: {
      toolCall: Record<string, unknown>;
      options: Array<{ optionId?: string; kind?: string }>;
    }) => {
      permissionToolCalls.push(params.toolCall);
      permissionOptions.push(params.options);
      return { outcome: { outcome: "selected", optionId: chooseOptionId } };
    },
    extNotification: async () => {},
  } as unknown as AgentSideConnection;
  const agent = new CodexAppServerAgent(client, {
    processOptions: { binaryPath: "/bundle/codex" },
    model: "gpt-5.5",
    rpcFactory: stub.factory,
  });
  return { agent, stub, permissionToolCalls, permissionOptions };
}
+ expect(decision).toEqual({ decision: "approved_execpolicy_amendment" }); + }); + + it("omits Allow-always when codex offers no remember decision", async () => { + const { agent, stub, permissionOptions } = makeApprovalAgent("allow"); + await agent.initialize(init); + await agent.newSession({ cwd: "/repo" } as unknown as NewSessionRequest); + + const decision = await stub.invokeRequest( + "item/commandExecution/requestApproval", + { itemId: "c1", command: "ls" }, + ); + + expect(permissionOptions[0].map((o) => o.kind)).not.toContain( + "allow_always", + ); + expect(permissionOptions[0].map((o) => o.optionId)).toEqual([ + "allow", + "reject", + "reject_with_feedback", + ]); + expect(decision).toEqual({ decision: "accept" }); + }); + + it("reject-with-feedback declines and steers the user's guidance into the running turn", async () => { + const stub = makeStubRpc({ + initialize: {}, + "thread/start": { thread: { id: "thr_1" } }, + "turn/start": { turn: { id: "turn_1" } }, + // codex rotates the turn id on steer. + "turn/steer": { turnId: "turn_2" }, + }); + const offeredOptions: Array> = + []; + const client = { + sessionUpdate: async () => {}, + requestPermission: async (params: { + options: Array<{ optionId?: string; kind?: string }>; + }) => { + offeredOptions.push(params.options); + return { + outcome: { outcome: "selected", optionId: "reject_with_feedback" }, + _meta: { customInput: "use the SDK instead of shelling out" }, + }; + }, + extNotification: async () => {}, + } as unknown as AgentSideConnection; const agent = new CodexAppServerAgent(client, { processOptions: { binaryPath: "/x/codex" }, + model: "gpt-5.5", rpcFactory: stub.factory, }); + await agent.initialize(init); + await agent.newSession({ cwd: "/repo" } as unknown as NewSessionRequest); + // Start a turn so there's a live turnId for the steer to target. 
+ const done = agent.prompt({ + sessionId: "thr_1", + prompt: [{ type: "text", text: "go" }], + } as unknown as PromptRequest); + stub.emit("turn/started", { turn: { id: "turn_1" } }); - await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + // codex asks to run a command mid-turn; user rejects with guidance. const decision = await stub.invokeRequest( "item/commandExecution/requestApproval", - { itemId: "i", command: "ls -la" }, + { itemId: "c1", command: "rm -rf build" }, + ); + + expect(decision).toEqual({ decision: "decline" }); + const feedbackOpt = offeredOptions[0].find( + (o) => o.optionId === "reject_with_feedback", ); + expect(feedbackOpt).toBeTruthy(); + // The guidance was steered into the running turn as a follow-up message. + const steer = stub.requests.find((r) => r.method === "turn/steer"); + expect((steer?.params as { expectedTurnId?: string })?.expectedTurnId).toBe( + "turn_1", + ); + + // The rotated turn id from the steer response was adopted: a second + // rejection targets turn_2, not the dead turn_1. 
+ await new Promise((r) => setImmediate(r)); + await stub.invokeRequest("item/commandExecution/requestApproval", { + itemId: "c2", + command: "rm -rf dist", + }); + const steers = stub.requests.filter((r) => r.method === "turn/steer"); + expect( + (steers[1]?.params as { expectedTurnId?: string })?.expectedTurnId, + ).toBe("turn_2"); - expect(decision).toBe("accept"); + stub.emit("turn/completed", { turn: { status: "completed" } }); + await done; }); - it("rejects the pending turn when the app-server stream closes", async () => { + it("routes a non-MCP file-change approval to an edit permission (kind + diff + locations)", async () => { + const { agent, stub, permissionToolCalls } = makeApprovalAgent(); + await agent.initialize(init); + await agent.newSession({ cwd: "/repo" } as unknown as NewSessionRequest); + + await stub.invokeRequest("item/fileChange/requestApproval", { + itemId: "f1", + changes: [{ path: "src/a.ts", diff: "@@ -1 +1 @@\n-old\n+new\n" }], + }); + + expect(permissionToolCalls).toHaveLength(1); + const tc = permissionToolCalls[0]; + expect(tc.kind).toBe("edit"); + expect(tc.locations).toEqual([{ path: "src/a.ts" }]); + // A diff content block so the host's EditPermission renders the change. 
+ expect(Array.isArray(tc.content)).toBe(true); + expect((tc.content as Array<{ type?: string }>)[0]?.type).toBe("diff"); + }); + + it("passes outputSchema to turn/start and fires onStructuredOutput", async () => { const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); const { client } = makeFakeClient(); + const outputs: Array> = []; + const schema = { + type: "object", + properties: { repo: { type: "string" } }, + required: ["repo"], + }; const agent = new CodexAppServerAgent(client, { processOptions: { binaryPath: "/x/codex" }, rpcFactory: stub.factory, + onStructuredOutput: async (o) => { + outputs.push(o); + }, }); - await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + await agent.newSession({ + cwd: "/r", + _meta: { jsonSchema: schema }, + } as unknown as NewSessionRequest); const done = agent.prompt({ sessionId: "t", - prompt: [{ type: "text", text: "hi" }], + prompt: [{ type: "text", text: "pick a repo" }], } as unknown as PromptRequest); - stub.triggerClose(); + // The schema-constrained final message is pure JSON. 
+ stub.emit("item/completed", { + item: { + type: "agentMessage", + id: "a1", + text: '{"repo":"posthog/posthog"}', + }, + }); + stub.emit("turn/completed", { turn: { status: "completed" } }); + await done; - await expect(done).rejects.toThrow(/exited before the turn completed/); + const turnStart = stub.requests.find((r) => r.method === "turn/start"); + expect(turnStart?.params).toMatchObject({ outputSchema: schema }); + expect(outputs).toEqual([{ repo: "posthog/posthog" }]); }); - it("interrupts by sending turn/interrupt before reporting cancelled", async () => { + it("injects task instructions and mcp_servers into thread/start", async () => { const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); const { client } = makeFakeClient(); const agent = new CodexAppServerAgent(client, { - processOptions: { binaryPath: "/x/codex" }, + processOptions: { + binaryPath: "/x/codex", + developerInstructions: "Codex guidance.", + }, rpcFactory: stub.factory, }); - await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); - const done = agent.prompt({ - sessionId: "t", - prompt: [], - } as unknown as PromptRequest); + await agent.newSession({ + cwd: "/r", + _meta: { systemPrompt: "You are a repo selector." 
}, + mcpServers: [ + { + name: "posthog", + command: "node", + args: ["server.js"], + env: [{ name: "TOKEN", value: "abc" }], + }, + ], + } as unknown as NewSessionRequest); - await agent.cancel({ sessionId: "t" }); + const threadStart = stub.requests.find((r) => r.method === "thread/start"); + expect(threadStart?.params).toMatchObject({ + developerInstructions: "Codex guidance.\n\nYou are a repo selector.", + config: { + mcp_servers: { + posthog: { + command: "node", + args: ["server.js"], + env: { TOKEN: "abc" }, + }, + }, + }, + }); + }); - expect((await done).stopReason).toBe("cancelled"); - expect(stub.requests.some((r) => r.method === "turn/interrupt")).toBe(true); + it("flattens the host's {append} systemPrompt and dedupes it against developerInstructions", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { + binaryPath: "/x/codex", + // The host pre-flattens into developerInstructions AND sends the raw {append} form. + developerInstructions: "Be a careful engineer.", + }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ + cwd: "/r", + _meta: { systemPrompt: { append: "Be a careful engineer." } }, + } as unknown as NewSessionRequest); + + const threadStart = stub.requests.find((r) => r.method === "thread/start"); + // {append} is flattened (not "[object Object]") and, being identical, deduped to one copy. 
+ expect( + (threadStart?.params as { developerInstructions?: string }) + .developerInstructions, + ).toBe("Be a careful engineer."); }); - it("rejects a concurrent prompt while a turn is in progress", async () => { + it("appends a distinct {append} systemPrompt to developerInstructions", async () => { const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); const { client } = makeFakeClient(); const agent = new CodexAppServerAgent(client, { - processOptions: { binaryPath: "/x/codex" }, + processOptions: { + binaryPath: "/x/codex", + developerInstructions: "Codex base guidance.", + }, rpcFactory: stub.factory, }); - await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); - const first = agent.prompt({ + await agent.newSession({ + cwd: "/r", + _meta: { systemPrompt: { append: "Task: fix the bug." } }, + } as unknown as NewSessionRequest); + + const threadStart = stub.requests.find((r) => r.method === "thread/start"); + expect( + (threadStart?.params as { developerInstructions?: string }) + .developerInstructions, + ).toBe("Codex base guidance.\n\nTask: fix the bug."); + }); + + it("honors the host's initial _meta.permissionMode (read-only) in turn/start", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t" } }, + "turn/start": { turn: { id: "turn_1" } }, + }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + await agent.newSession({ + cwd: "/r", + _meta: { permissionMode: "read-only" }, + } as unknown as NewSessionRequest); + const done = agent.prompt({ sessionId: "t", - prompt: [], + prompt: [{ type: "text", text: "go" }], } as unknown as PromptRequest); + stub.emit("turn/completed", { turn: { status: "completed" } }); + await done; - await expect( - agent.prompt({ sessionId: "t", prompt: [] } as unknown as PromptRequest), - ).rejects.toThrow(/already in progress/); + const turnStart = 
stub.requests.find((r) => r.method === "turn/start"); + // read-only maps to approvalPolicy "untrusted" (mirrors codex-acp). + expect( + (turnStart?.params as { approvalPolicy?: string }).approvalPolicy, + ).toBe("untrusted"); + }); + it("falls back to auto for a non-codex initial permissionMode", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t" } }, + "turn/start": { turn: { id: "turn_1" } }, + }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + // "bypassPermissions" is a Claude mode, not a codex mode → default "auto". + await agent.newSession({ + cwd: "/r", + _meta: { permissionMode: "bypassPermissions" }, + } as unknown as NewSessionRequest); + const done = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "go" }], + } as unknown as PromptRequest); stub.emit("turn/completed", { turn: { status: "completed" } }); - await first; + await done; + + const turnStart = stub.requests.find((r) => r.method === "turn/start"); + expect( + (turnStart?.params as { approvalPolicy?: string }).approvalPolicy, + ).toBe("on-request"); }); - it("runs sequential turns on the same session", async () => { - const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + it("applies a read-only sandboxPolicy + approvalPolicy when the picker is Plan", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t" } }, + "turn/start": { turn: { id: "turn_1" } }, + }); const { client } = makeFakeClient(); const agent = new CodexAppServerAgent(client, { processOptions: { binaryPath: "/x/codex" }, + model: "gpt-5.5", rpcFactory: stub.factory, }); - await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); - - const first = agent.prompt({ + await agent.setSessionConfigOption({ + configId: "mode", + value: "plan", sessionId: "t", - prompt: [{ type: "text", text: "one" }], + } as 
never); + const done = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "go" }], } as unknown as PromptRequest); stub.emit("turn/completed", { turn: { status: "completed" } }); - expect((await first).stopReason).toBe("end_turn"); + await done; - const second = agent.prompt({ + const turnStart = stub.requests.find((r) => r.method === "turn/start"); + const params = turnStart?.params as { + sandboxPolicy?: unknown; + approvalPolicy?: string; + collaborationMode?: unknown; + }; + // Plan engages codex's plan collaboration AND blocks edits via a read-only sandbox. + expect(params.collaborationMode).toEqual({ + mode: "plan", + settings: { model: "gpt-5.5" }, + }); + expect(params.sandboxPolicy).toEqual({ + type: "readOnly", + networkAccess: true, + }); + expect(params.approvalPolicy).toBe("on-request"); + }); + + it("omits sandboxPolicy for an editing preset (auto) so the spawned full-access stays", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t" } }, + "turn/start": { turn: { id: "turn_1" } }, + }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + model: "gpt-5.5", + rpcFactory: stub.factory, + }); + // Default mode is "auto" → editing allowed, no sandbox override. + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const done = agent.prompt({ sessionId: "t", - prompt: [{ type: "text", text: "two" }], + prompt: [{ type: "text", text: "go" }], } as unknown as PromptRequest); stub.emit("turn/completed", { turn: { status: "completed" } }); - expect((await second).stopReason).toBe("end_turn"); + await done; + + const turnStart = stub.requests.find((r) => r.method === "turn/start"); + const params = turnStart?.params as { + sandboxPolicy?: unknown; + collaborationMode?: unknown; + }; + expect(params.sandboxPolicy).toBeUndefined(); + // Default collaboration is pushed every turn so switching back from Plan reverts. 
+ expect(params.collaborationMode).toEqual({ + mode: "default", + settings: { model: "gpt-5.5" }, + }); }); - it("maps a rejected approval to decline", async () => { - const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); - const { client } = makeFakeClient({ - outcome: "selected", - optionId: "reject", + it("returns mode + model + thought_level configOptions and emits config_option_update", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t" } }, + "model/list": { + data: [ + { + id: "gpt-5.5", + model: "gpt-5.5", + displayName: "GPT-5.5", + hidden: false, + supportedReasoningEfforts: [ + { reasoningEffort: "low" }, + { reasoningEffort: "high" }, + ], + }, + ], + }, }); + const { client, sessionUpdates } = makeFakeClient(); const agent = new CodexAppServerAgent(client, { processOptions: { binaryPath: "/x/codex" }, + model: "gpt-5.5", rpcFactory: stub.factory, }); - - await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const session = await agent.newSession({ + cwd: "/r", + } as unknown as NewSessionRequest); + const opts = (session.configOptions ?? 
[]) as any[]; + expect(opts.map((o) => o.category)).toEqual([ + "mode", + "model", + "thought_level", + ]); expect( - await stub.invokeRequest("item/fileChange/requestApproval", { - itemId: "i", - }), - ).toBe("decline"); + opts.find((o) => o.category === "mode").options.map((x: any) => x.value), + ).toEqual(["plan", "read-only", "auto", "full-access"]); + expect( + opts + .find((o) => o.category === "thought_level") + .options.map((x: any) => x.value), + ).toEqual(["low", "high"]); + expect( + sessionUpdates.some( + (u: any) => u.update?.sessionUpdate === "config_option_update", + ), + ).toBe(true); }); - it("maps a cancelled approval to cancel", async () => { - const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); - const { client } = makeFakeClient({ outcome: "cancelled" }); + it("drops Claude models from the picker and falls back to the codex effort map when model/list reports none", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t" } }, + "model/list": { + data: [ + { + id: "gpt-5.5", + model: "gpt-5.5", + displayName: "GPT-5.5", + hidden: false, + // The PostHog gateway populates no efforts (defaultReasoningEffort:"none"). + supportedReasoningEfforts: [], + }, + { + // The gateway also serves Claude models — they must not leak into the picker. + id: "claude-opus-4-8", + model: "claude-opus-4-8", + hidden: false, + supportedReasoningEfforts: [], + }, + ], + }, + }); + const { client } = makeFakeClient(); const agent = new CodexAppServerAgent(client, { processOptions: { binaryPath: "/x/codex" }, + model: "gpt-5.5", rpcFactory: stub.factory, }); + const session = await agent.newSession({ + cwd: "/r", + } as unknown as NewSessionRequest); + const opts = (session.configOptions ?? 
[]) as any[]; - await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); expect( - await stub.invokeRequest("item/commandExecution/requestApproval", { - itemId: "i", - command: "ls", - }), - ).toBe("cancel"); + opts.find((o) => o.category === "model").options.map((x: any) => x.value), + ).toEqual(["gpt-5.5"]); + // No live efforts → shared codex map, which exposes xhigh for the gpt-5.5 family. + expect( + opts + .find((o) => o.category === "thought_level") + .options.map((x: any) => x.value), + ).toContain("xhigh"); + }); + + it("setSessionConfigOption switches the model and re-emits config", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + model: "gpt-5.5", + rpcFactory: stub.factory, + }); + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const res = await agent.setSessionConfigOption({ + configId: "model", + value: "gpt-6", + sessionId: "t", + } as any); + const modelOpt = (res.configOptions as any[]).find( + (o) => o.category === "model", + ); + expect(modelOpt.currentValue).toBe("gpt-6"); + }); + + it("sends activePermissionProfile :read-only on turn/start in read-only mode", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + await agent.setSessionConfigOption({ + configId: "mode", + value: "read-only", + sessionId: "t", + } as any); + + const done = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "look around" }], + } as unknown as PromptRequest); + stub.emit("turn/completed", { turn: { status: "completed" } }); + await done; + + // codex 0.140.0 enforces 
the sandbox via the named profile, so read-only MUST send it alongside sandboxPolicy. + const turnStart = stub.requests.find((r) => r.method === "turn/start"); + expect(turnStart?.params).toMatchObject({ + activePermissionProfile: { extends: ":read-only" }, + sandboxPolicy: { type: "readOnly" }, + }); + }); + + it("resumeSession resumes the existing thread and returns configOptions", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t1" } }, + "thread/resume": { thread: { id: "t1" } }, + }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + model: "gpt-5.5", + rpcFactory: stub.factory, + }); + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const res = await agent.resumeSession({ + sessionId: "t1", + cwd: "/r", + mcpServers: [], + } as any); + const resumeReq = stub.requests.find((r) => r.method === "thread/resume"); + expect(resumeReq?.params).toMatchObject({ threadId: "t1" }); + expect((res.configOptions as any[]).length).toBeGreaterThan(0); + }); + + it("listSessions maps thread/list to ACP sessions", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t" } }, + "thread/list": { + data: [ + { id: "t1", cwd: "/r", name: "Task 1" }, + { id: "t2", cwd: "/r2" }, + ], + }, + }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + model: "gpt-5.5", + rpcFactory: stub.factory, + }); + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const res = await agent.listSessions({ cwd: "/r" } as any); + expect(res.sessions).toEqual([ + { sessionId: "t1", cwd: "/r", title: "Task 1" }, + { sessionId: "t2", cwd: "/r2" }, + ]); + }); + + it("forkSession forks and returns a session id", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t1" } }, + "thread/fork": { thread: { id: "t2" } }, 
+ }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + model: "gpt-5.5", + rpcFactory: stub.factory, + }); + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const res = await agent.unstable_forkSession({ + sessionId: "t1", + cwd: "/r", + mcpServers: [], + } as any); + expect(res.sessionId).toBe("t2"); + }); + + it("maps a failed turn to a refusal stop reason", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const done = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "go" }], + } as unknown as PromptRequest); + stub.emit("turn/completed", { turn: { status: "failed" } }); + + expect((await done).stopReason).toBe("refusal"); + }); + + it("maps an interrupted turn to cancelled", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const done = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "go" }], + } as unknown as PromptRequest); + stub.emit("turn/completed", { turn: { status: "interrupted" } }); + + expect((await done).stopReason).toBe("cancelled"); + }); + + it("finalizes the turn on a non-retried error notification", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, 
+ }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const done = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "go" }], + } as unknown as PromptRequest); + // willRetry:false must resolve the turn rather than hang until stream close. + stub.emit("error", { willRetry: false, error: { message: "boom" } }); + + expect((await done).stopReason).toBe("refusal"); + }); + + it("ends the turn without turn/start when no prompt block is usable", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const res = await agent.prompt({ + sessionId: "t", + prompt: [{ type: "audio", data: "AAAA", mimeType: "audio/wav" }], + } as unknown as PromptRequest); + + expect(res.stopReason).toBe("end_turn"); + expect(stub.requests.some((r) => r.method === "turn/start")).toBe(false); + }); + + it("finalizes a turn once when error and turn/completed both arrive", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const outputs: Array> = []; + const { client, extNotifications } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + onStructuredOutput: async (o) => { + outputs.push(o); + }, + }); + const schema = { + type: "object", + properties: { ok: { type: "boolean" } }, + required: ["ok"], + }; + + await agent.newSession({ + cwd: "/r", + _meta: { jsonSchema: schema, taskRunId: "run_x" }, + } as unknown as NewSessionRequest); + const done = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "go" }], + } as unknown as PromptRequest); + + stub.emit("item/completed", { + item: { type: "agentMessage", id: "a1", text: '{"ok":true}' 
}, + }); + // error + turn/completed for one turn must not double-fire turn_complete (idempotent). + stub.emit("error", { willRetry: false, error: { message: "boom" } }); + stub.emit("turn/completed", { turn: { status: "failed" } }); + await done; + + // Structured output is gated on a clean end_turn: a refused turn records nothing. + expect(outputs).toEqual([]); + expect( + extNotifications.filter((n) => n.method === "_posthog/turn_complete") + .length, + ).toBe(1); + }); + + it("routes command approvals to the host and maps allow to a decision envelope", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const decision = await stub.invokeRequest( + "item/commandExecution/requestApproval", + { itemId: "i", command: "ls -la" }, + ); + + expect(decision).toEqual({ decision: "accept" }); + }); + + it("rejects the pending turn when the app-server stream closes", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const done = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "hi" }], + } as unknown as PromptRequest); + + stub.triggerClose(); + + await expect(done).rejects.toThrow(/exited before the turn completed/); + }); + + it("interrupts by sending turn/interrupt with the live threadId + turnId", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { 
binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const done = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "go" }], + } as unknown as PromptRequest); + // turn/started carries the live turnId the server REQUIRES on turn/interrupt (else -32600). + stub.emit("turn/started", { turn: { id: "turn_1" } }); + + await agent.cancel({ sessionId: "t" }); + + expect((await done).stopReason).toBe("cancelled"); + const req = stub.requests.find((r) => r.method === "turn/interrupt"); + expect(req?.params).toEqual({ threadId: "t", turnId: "turn_1" }); + }); + + it("a cancelled turn's late completion does not cancel the follow-up turn", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t" } }, + "turn/start": { turn: { id: "turn_1" } }, + }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + + // Turn 1, then cancel it (records turn_1 as interrupted). + const first = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "go" }], + } as unknown as PromptRequest); + stub.emit("turn/started", { turn: { id: "turn_1" } }); + await agent.cancel({ sessionId: "t" }); + expect((await first).stopReason).toBe("cancelled"); + + // Follow-up turn 2. + const second = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "again" }], + } as unknown as PromptRequest); + stub.emit("turn/started", { turn: { id: "turn_2" } }); + // The cancelled turn's late completion arrives during turn 2 — it must be ignored. 
+ stub.emit("turn/completed", { + turn: { id: "turn_1", status: "interrupted" }, + }); + stub.emit("turn/completed", { + turn: { id: "turn_2", status: "completed" }, + }); + expect((await second).stopReason).toBe("end_turn"); + }); + + it("emits _posthog/turn_complete with cancelled on interrupt (matches codex-acp)", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client, extNotifications } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ + cwd: "/r", + _meta: { taskRunId: "run_c" }, + } as unknown as NewSessionRequest); + const done = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "go" }], + } as unknown as PromptRequest); + // Emit turn/started so the interrupt actually reaches the binary (else false-green on local finalize). + stub.emit("turn/started", { turn: { id: "turn_1" } }); + await agent.cancel({ sessionId: "t" }); + + expect((await done).stopReason).toBe("cancelled"); + // The interrupt RPC was genuinely sent (not just locally finalized)... + expect( + stub.requests.find((r) => r.method === "turn/interrupt")?.params, + ).toEqual({ threadId: "t", turnId: "turn_1" }); + // ...and a cancelled turn still emits the cloud idle signal, exactly once. 
+ const tcs = extNotifications.filter( + (n) => n.method === "_posthog/turn_complete", + ); + expect(tcs).toHaveLength(1); + expect((tcs[0].params as { stopReason?: string }).stopReason).toBe( + "cancelled", + ); + }); + + it("skips turn/interrupt (but still finalizes cancelled) when no turn/started arrived", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const done = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "go" }], + } as unknown as PromptRequest); + // No turn/started → no turnId: interrupt() must skip the RPC (else -32600) and still finalize. + await agent.cancel({ sessionId: "t" }); + + expect((await done).stopReason).toBe("cancelled"); + expect(stub.requests.some((r) => r.method === "turn/interrupt")).toBe( + false, + ); + }); + + it("rejects a concurrent prompt while a turn is in progress", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const first = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "go" }], + } as unknown as PromptRequest); + + await expect( + agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "again" }], + } as unknown as PromptRequest), + ).rejects.toThrow(/already in progress/); + + stub.emit("turn/completed", { turn: { status: "completed" } }); + await first; + }); + + it("runs sequential turns on the same session", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } 
= makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + + const first = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "one" }], + } as unknown as PromptRequest); + stub.emit("turn/completed", { turn: { status: "completed" } }); + expect((await first).stopReason).toBe("end_turn"); + + const second = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "two" }], + } as unknown as PromptRequest); + stub.emit("turn/completed", { turn: { status: "completed" } }); + expect((await second).stopReason).toBe("end_turn"); + }); + + it("maps a rejected approval to decline", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } = makeFakeClient({ + outcome: "selected", + optionId: "reject", + }); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + expect( + await stub.invokeRequest("item/fileChange/requestApproval", { + itemId: "i", + }), + ).toEqual({ decision: "decline" }); + }); + + it("maps a cancelled approval to cancel", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } = makeFakeClient({ outcome: "cancelled" }); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + expect( + await stub.invokeRequest("item/commandExecution/requestApproval", { + itemId: "i", + command: "ls", + }), + ).toEqual({ decision: "cancel" }); + }); + + it("folds a mid-turn prompt into the running turn via turn/steer", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { 
id: "t" } }, + "turn/start": { turn: { id: "turn_1" } }, + }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const first = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "one" }], + } as unknown as PromptRequest); + + // The active turn id arrives via turn/started; it's the steer precondition. + stub.emit("turn/started", { threadId: "t", turn: { id: "turn_1" } }); + + const second = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "more context" }], + } as unknown as PromptRequest); + + // The single turn/completed resolves both the original and the folded prompt. + stub.emit("turn/completed", { turn: { status: "completed" } }); + expect((await first).stopReason).toBe("end_turn"); + expect((await second).stopReason).toBe("end_turn"); + + const steer = stub.requests.find((r) => r.method === "turn/steer"); + expect(steer?.params).toMatchObject({ + threadId: "t", + expectedTurnId: "turn_1", + input: [{ type: "text", text: "more context" }], + }); + // Only one turn/start — the second prompt steered rather than starting anew. 
+ expect(stub.requests.filter((r) => r.method === "turn/start")).toHaveLength( + 1, + ); + }); + + it("refreshes the live turnId from each turn/steer response", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t" } }, + "turn/start": { turn: { id: "turn_1" } }, + "turn/steer": { turnId: "turn_2" }, // the server rotates the active turn id + }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const first = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "one" }], + } as unknown as PromptRequest); + stub.emit("turn/started", { turn: { id: "turn_1" } }); + + const second = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "two" }], + } as unknown as PromptRequest); + // Let the first steer's rotated turnId apply before the next steer reads it. + await new Promise((r) => setTimeout(r, 0)); + const third = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "three" }], + } as unknown as PromptRequest); + + stub.emit("turn/completed", { turn: { status: "completed" } }); + await Promise.all([first, second, third]); + + const steers = stub.requests.filter((r) => r.method === "turn/steer"); + expect(steers).toHaveLength(2); + expect( + (steers[0].params as { expectedTurnId?: string }).expectedTurnId, + ).toBe("turn_1"); + // After the first steer rotated the id, the second steer must target turn_2. 
+ expect( + (steers[1].params as { expectedTurnId?: string }).expectedTurnId, + ).toBe("turn_2"); + }); + + it("omits disabled skills from available_commands_update", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t" } }, + "skills/list": { + data: [ + { + skills: [ + { name: "deploy", description: "Deploy", enabled: true }, + { name: "danger", description: "Disabled", enabled: false }, + ], + }, + ], + }, + }); + const { client, sessionUpdates } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + + const cmds = ( + sessionUpdates.find( + (u: any) => u.update?.sessionUpdate === "available_commands_update", + ) as any + )?.update?.availableCommands; + expect(cmds.map((c: { name: string }) => c.name)).toEqual(["deploy"]); + }); + + it("emits _posthog/sdk_session when a taskRunId is present", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "thr_x" } } }); + const { client, extNotifications } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ + cwd: "/r", + _meta: { taskRunId: "run_42" }, + } as unknown as NewSessionRequest); + + expect(extNotifications).toContainEqual({ + method: "_posthog/sdk_session", + params: { taskRunId: "run_42", sessionId: "thr_x", adapter: "codex" }, + }); + }); + + it("does not emit _posthog/sdk_session without a taskRunId", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client, extNotifications } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + expect( + 
extNotifications.some((n) => n.method === "_posthog/sdk_session"), + ).toBe(false); + }); + + it("emits _posthog/turn_complete and usage breakdown on turn completion", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t" } }, + "turn/start": { turn: { id: "turn_1" } }, + }); + const { client, extNotifications } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ + cwd: "/r", + _meta: { taskRunId: "run_1", systemPrompt: "be terse" }, + } as unknown as NewSessionRequest); + const done = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "hi" }], + } as unknown as PromptRequest); + + stub.emit("thread/tokenUsage/updated", { + threadId: "t", + turnId: "turn_1", + tokenUsage: { + total: { + totalTokens: 100, + inputTokens: 60, + cachedInputTokens: 10, + outputTokens: 30, + reasoningOutputTokens: 5, + }, + modelContextWindow: 200000, + }, + }); + stub.emit("turn/completed", { turn: { status: "completed" } }); + await done; + + const turnComplete = extNotifications.find( + (n) => n.method === "_posthog/turn_complete", + ); + expect(turnComplete?.params).toMatchObject({ + sessionId: "t", + stopReason: "end_turn", + usage: { + inputTokens: 60, + outputTokens: 30, + cachedReadTokens: 10, + cachedWriteTokens: 0, + totalTokens: 100, + }, + }); + // The breakdown variant carries a per-source `breakdown`, not `used`. + const breakdown = extNotifications.find( + (n) => + n.method === "_posthog/usage_update" && + (n.params as { breakdown?: unknown }).breakdown, + ); + expect(breakdown).toBeDefined(); + }); + + it("context-usage indicator reports the latest turn, not the cumulative thread total", async () => { + // The window-occupancy indicator must track `last`, not the cumulative `total` + // (which over-reports the window as filling from accumulation alone). 
+ const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client, extNotifications } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + await agent.newSession({ + cwd: "/r", + _meta: { taskRunId: "run_ctx" }, + } as unknown as NewSessionRequest); + const done = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "hi" }], + } as unknown as PromptRequest); + + stub.emit("thread/tokenUsage/updated", { + tokenUsage: { + total: { + totalTokens: 433289, + inputTokens: 432636, + cachedInputTokens: 76928, + outputTokens: 595, + }, + last: { + totalTokens: 189075, + inputTokens: 111552, + cachedInputTokens: 76928, + outputTokens: 595, + }, + modelContextWindow: 997500, + }, + }); + stub.emit("turn/completed", { turn: { status: "completed" } }); + await done; + + const usageUpdate = extNotifications.find( + (n) => + n.method === "_posthog/usage_update" && + typeof (n.params as { used?: unknown }).used === "number", + ); + // `used` is last.totalTokens (189075), NOT total.totalTokens (433289). + expect(usageUpdate?.params).toMatchObject({ + used: 189075, + size: 997500, + usage: { inputTokens: 111552, totalTokens: 189075 }, + }); + }); + + it("reports codex's per-turn `last` (not the cumulative total) in turn_complete", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client, extNotifications } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + await agent.newSession({ + cwd: "/r", + _meta: { taskRunId: "run_u" }, + } as unknown as NewSessionRequest); + + // We let `last` drive the per-turn number rather than diffing the cumulative `total`. 
+ const t1 = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "a" }], + } as unknown as PromptRequest); + stub.emit("thread/tokenUsage/updated", { + tokenUsage: { + total: { inputTokens: 100, outputTokens: 50 }, + last: { inputTokens: 100, outputTokens: 50 }, + }, + }); + stub.emit("turn/completed", { turn: { status: "completed" } }); + await t1; + + const t2 = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "b" }], + } as unknown as PromptRequest); + stub.emit("thread/tokenUsage/updated", { + tokenUsage: { + total: { inputTokens: 250, outputTokens: 120 }, + last: { inputTokens: 150, outputTokens: 70 }, + }, + }); + stub.emit("turn/completed", { turn: { status: "completed" } }); + await t2; + + const tcs = extNotifications.filter( + (n) => n.method === "_posthog/turn_complete", + ); + expect(tcs).toHaveLength(2); + expect( + (tcs[0].params as { usage: Record }).usage, + ).toMatchObject({ + inputTokens: 100, + outputTokens: 50, + }); + // Turn 2 is codex's `last` (150/70) — NOT the cumulative total (250/120). + expect( + (tcs[1].params as { usage: Record }).usage, + ).toMatchObject({ + inputTokens: 150, + outputTokens: 70, + }); + }); + + it("signals compaction start (_posthog/status) when a contextCompaction item begins", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client, extNotifications } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + await agent.newSession({ + cwd: "/r", + _meta: {}, + } as unknown as NewSessionRequest); + + stub.emit("item/started", { + item: { type: "contextCompaction", id: "c1" }, + }); + + // Mirrors the Claude adapter — the host sets isCompacting (gates steer/queue). 
+ const status = extNotifications.find((n) => n.method === "_posthog/status"); + expect(status?.params).toMatchObject({ + sessionId: "t", + status: "compacting", + }); + }); + + it("emits compact_boundary + a transcript marker when the compaction item completes", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client, extNotifications, sessionUpdates } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + await agent.newSession({ + cwd: "/r", + _meta: {}, + } as unknown as NewSessionRequest); + + // The compaction item brackets it: started → in progress, completed → boundary. + stub.emit("item/started", { + item: { type: "contextCompaction", id: "c1" }, + }); + stub.emit("item/completed", { + item: { type: "contextCompaction", id: "c1", summary: "…" }, + }); + + // compact_boundary clears isCompacting + drains the host queue. + expect( + extNotifications.find((n) => n.method === "_posthog/compact_boundary") + ?.params, + ).toMatchObject({ sessionId: "t" }); + // ...and a user-visible marker lands in the transcript. + expect(sessionUpdates).toContainEqual({ + sessionId: "t", + update: { + sessionUpdate: "agent_message_chunk", + content: { type: "text", text: "\n\nContext compacted." }, + }, + }); + // Exactly one boundary — the dedupe flag prevents a double-emit. 
+ expect( + extNotifications.filter((n) => n.method === "_posthog/compact_boundary"), + ).toHaveLength(1); + }); + + it("still emits compact_boundary when the turn dies mid-compaction (no stuck isCompacting)", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client, extNotifications } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + await agent.newSession({ + cwd: "/r", + _meta: {}, + } as unknown as NewSessionRequest); + + const done = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "go" }], + } as unknown as PromptRequest); + // A fatal error ends the turn before item/completed; the finalize-time recovery still fires the boundary. + stub.emit("item/started", { + item: { type: "contextCompaction", id: "c1" }, + }); + stub.emit("error", { willRetry: false, error: { message: "boom" } }); + await done; + + expect( + extNotifications.find((n) => n.method === "_posthog/compact_boundary") + ?.params, + ).toMatchObject({ sessionId: "t" }); + }); + + it("loadSession resumes the thread and returns configOptions", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t1" } }, + "thread/resume": { thread: { id: "t1" } }, + }); + const { client, extNotifications } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + model: "gpt-5.5", + rpcFactory: stub.factory, + }); + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + + const res = await agent.loadSession({ + sessionId: "t1", + cwd: "/r", + mcpServers: [], + _meta: { taskRunId: "run_load" }, + } as unknown as Parameters[0]); + + const resumeReq = stub.requests.find((r) => r.method === "thread/resume"); + expect(resumeReq?.params).toMatchObject({ threadId: "t1" }); + expect((res.configOptions as any[]).length).toBeGreaterThan(0); + // loadSession 
replays sdk_session so post-reload task tracking still works. + expect(extNotifications).toContainEqual({ + method: "_posthog/sdk_session", + params: { taskRunId: "run_load", sessionId: "t1", adapter: "codex" }, + }); + }); + + it("loadSession replays the resumed thread's persisted transcript", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t1" } }, + "thread/resume": { + thread: { + id: "t1", + turns: [ + { + items: [ + { + type: "userMessage", + id: "u1", + content: [{ type: "text", text: "fix the bug" }], + }, + { + type: "commandExecution", + id: "c1", + command: "ls", + status: "completed", + }, + { type: "agentMessage", id: "a1", text: "fixed it" }, + ], + }, + ], + }, + }, + }); + const { client, sessionUpdates } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + model: "gpt-5.5", + rpcFactory: stub.factory, + }); + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + + await agent.loadSession({ + sessionId: "t1", + cwd: "/r", + mcpServers: [], + } as unknown as Parameters[0]); + + const kinds = (sessionUpdates as any[]).map((u) => u.update?.sessionUpdate); + expect(kinds).toEqual( + expect.arrayContaining([ + "user_message_chunk", + "tool_call", + "agent_message_chunk", + ]), + ); + expect(sessionUpdates).toContainEqual({ + sessionId: "t1", + update: { + sessionUpdate: "user_message_chunk", + content: { type: "text", text: "fix the bug" }, + }, + }); + }); + + it("forwards additionalDirectories to thread/start as writable_roots", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ + cwd: "/repo", + additionalDirectories: ["/repo/pkg-a", "/repo/pkg-b"], + } as unknown as NewSessionRequest); + + const threadStart = 
stub.requests.find((r) => r.method === "thread/start"); + expect(threadStart?.params).toMatchObject({ + config: { + sandbox_workspace_write: { + writable_roots: ["/repo/pkg-a", "/repo/pkg-b"], + }, + }, + }); + }); + + it("carries an image block through to turn/start input", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t" } }, + "turn/start": { turn: { id: "turn_1" } }, + }); + const { client } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const done = agent.prompt({ + sessionId: "t", + prompt: [ + { type: "text", text: "look at this" }, + { type: "image", data: "aGVsbG8=", mimeType: "image/png" }, + ], + } as unknown as PromptRequest); + stub.emit("turn/completed", { turn: { status: "completed" } }); + await done; + + const turnStart = stub.requests.find((r) => r.method === "turn/start"); + expect(turnStart?.params).toMatchObject({ + input: [ + { type: "text", text: "look at this", text_elements: [] }, + { type: "image", url: "data:image/png;base64,aGVsbG8=" }, + ], + }); + }); + + it("prepends _meta.prContext to the forwarded turn input but not the echo", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t" } }, + "turn/start": { turn: { id: "turn_1" } }, + }); + const { client, sessionUpdates } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const done = agent.prompt({ + sessionId: "t", + prompt: [{ type: "text", text: "fix the bug" }], + _meta: { prContext: "PR #123 is open; review before editing." 
}, + } as unknown as PromptRequest); + stub.emit("turn/completed", { turn: { status: "completed" } }); + await done; + + // prContext is prepended to the FORWARDED prompt (parity with claude + codex-acp). + const turnStart = stub.requests.find((r) => r.method === "turn/start"); + expect( + (turnStart?.params as { input: Array<{ text?: string }> }).input, + ).toEqual([ + { + type: "text", + text: "PR #123 is open; review before editing.", + text_elements: [], + }, + { type: "text", text: "fix the bug", text_elements: [] }, + ]); + // The echoed user turn shows only the real message (no prContext prefix). + const echoes = (sessionUpdates as any[]).filter( + (u) => u.update?.sessionUpdate === "user_message_chunk", + ); + expect(echoes).toEqual([ + { + sessionId: "t", + update: { + sessionUpdate: "user_message_chunk", + content: { type: "text", text: "fix the bug" }, + }, + }, + ]); + }); + + it("echoes an image-only user turn as a user_message_chunk", async () => { + const stub = makeStubRpc({ + "thread/start": { thread: { id: "t" } }, + "turn/start": { turn: { id: "turn_1" } }, + }); + const { client, sessionUpdates } = makeFakeClient(); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const image = { type: "image", data: "aGVsbG8=", mimeType: "image/png" }; + const done = agent.prompt({ + sessionId: "t", + prompt: [image], + } as unknown as PromptRequest); + stub.emit("turn/completed", { turn: { status: "completed" } }); + await done; + + expect(sessionUpdates).toContainEqual({ + sessionId: "t", + update: { sessionUpdate: "user_message_chunk", content: image }, + }); + }); + + it("routes item/tool/requestUserInput through the richer-approval handler", async () => { + const stub = makeStubRpc({ "thread/start": { thread: { id: "t" } } }); + const { client } = makeFakeClient({ + outcome: "selected", + optionId: 
"option_0", + }); + const agent = new CodexAppServerAgent(client, { + processOptions: { binaryPath: "/x/codex" }, + rpcFactory: stub.factory, + }); + + await agent.newSession({ cwd: "/r" } as unknown as NewSessionRequest); + const response = await stub.invokeRequest("item/tool/requestUserInput", { + threadId: "t", + turnId: "turn_1", + itemId: "i1", + questions: [ + { + id: "q1", + header: "Pick", + question: "Which one?", + isOther: false, + isSecret: false, + options: [ + { label: "A", description: "" }, + { label: "B", description: "" }, + ], + }, + ], + autoResolutionMs: null, + }); + + // The richer handler returns a typed { answers } object, not a decision string. + expect(response).toEqual({ answers: { q1: { answers: ["A"] } } }); }); }); diff --git a/packages/agent/src/adapters/codex-app-server/codex-app-server-agent.ts b/packages/agent/src/adapters/codex-app-server/codex-app-server-agent.ts index 88797060eb..1008d89b22 100644 --- a/packages/agent/src/adapters/codex-app-server/codex-app-server-agent.ts +++ b/packages/agent/src/adapters/codex-app-server/codex-app-server-agent.ts @@ -1,14 +1,25 @@ import type { AgentSideConnection, - ContentBlock, + ForkSessionRequest, + ForkSessionResponse, InitializeRequest, InitializeResponse, + ListSessionsRequest, + ListSessionsResponse, + LoadSessionRequest, + LoadSessionResponse, NewSessionRequest, NewSessionResponse, PromptRequest, PromptResponse, + ResumeSessionRequest, + ResumeSessionResponse, + SetSessionConfigOptionRequest, + SetSessionConfigOptionResponse, StopReason, } from "@agentclientprotocol/sdk"; +import { mcpToolKey, posthogToolMeta } from "@posthog/shared"; +import { POSTHOG_NOTIFICATIONS } from "../../acp-extensions"; import { DEFAULT_CODEX_MODEL } from "../../gateway-models"; import type { ProcessSpawnedCallback } from "../../types"; import { Logger } from "../../utils/logger"; @@ -17,25 +28,68 @@ import { nodeWritableToWebWritable, } from "../../utils/streams"; import { BaseAcpAgent, type 
BaseSettingsManager } from "../base-acp-agent"; +import { + type ContextBreakdownBaseline, + emptyBaseline, + estimateTokens, +} from "../claude/context-breakdown"; import { AppServerClient, type AppServerClientHandlers, type AppServerRpc, } from "./app-server-client"; -import { mapAppServerNotification } from "./mapping"; +import { handleServerRequest } from "./approvals"; +import { + type AccumulatedUsage, + buildSdkSessionParams, + buildTurnCompleteParams, + buildUsageBreakdownParams, +} from "./ext-notifications"; +import { toCodexInput } from "./input"; +import { buildLocalToolsServer, type LocalToolsMeta } from "./local-tools-mcp"; +import { + type AppServerItem, + changePaths, + diffContent, + mapAppServerNotification, + mapHistoryItem, +} from "./mapping"; +import { toCodexMcpServers } from "./mcp-config"; +import { McpManager } from "./mcp-manager"; import { APP_SERVER_METHODS, APP_SERVER_NOTIFICATIONS, APP_SERVER_REQUESTS, } from "./protocol"; +import { SessionConfigState } from "./session-config"; import { type CodexAppServerProcess, type CodexAppServerProcessOptions, spawnCodexAppServerProcess, } from "./spawn"; +import { TurnController } from "./turn-controller"; +import { UsageTracker } from "./usage-tracker"; + +type AppServerSessionMeta = { + // The host sends either a plain string or the Claude-style `{ append }` form. + systemPrompt?: string | { append?: string }; + jsonSchema?: Record | null; + permissionMode?: string; + taskRunId?: string; + taskId?: string; + persistence?: { taskId?: string }; + environment?: "local" | "cloud"; + channelMode?: boolean; + baseBranch?: string; +}; + +/** The subset of codex's `Thread` the adapter reads: id + persisted `turns` for history replay. */ +type AppServerThread = { + id?: string; + turns?: Array<{ items?: Parameters[1][] }>; +}; -// The native app-server owns its own configuration, so there is nothing for the -// host to manage. BaseAcpAgent only calls dispose() on this. 
+// The native app-server owns its config; BaseAcpAgent only calls dispose() on this. class NoopSettingsManager implements BaseSettingsManager { constructor(private cwd: string) {} dispose(): void {} @@ -50,36 +104,44 @@ class NoopSettingsManager implements BaseSettingsManager { export interface CodexAppServerAgentOptions { processOptions: CodexAppServerProcessOptions; - /** Model id passed to thread/start. */ model?: string; - /** Reasoning effort passed to turn/start. */ reasoningEffort?: string; processCallbacks?: ProcessSpawnedCallback; logger?: Logger; + onStructuredOutput?: (output: Record) => Promise; /** Test seam: build the JSON-RPC client (defaults to spawning the process). */ rpcFactory?: (handlers: AppServerClientHandlers) => AppServerRpc; } /** - * ACP Agent backed by the native Codex `app-server` protocol. Presents the same - * ACP surface to PostHog Code as the codex-acp adapter, but talks to Codex's own - * JSON-RPC protocol underneath instead of going through the Zed translation layer. - * - * Spike scope: covers the core lifecycle (initialize, thread/start, turn/start - * with streamed agent messages, interrupt, approvals). Resume/fork, tool-call - * rendering, structured output and usage accounting are follow-ups. + * ACP Agent backed by the native Codex `app-server` JSON-RPC protocol. Presents the + * same ACP surface to PostHog Code as the codex-acp adapter, without the Zed + * translation layer, and stays at parity with it on the adapter surface. */ export class CodexAppServerAgent extends BaseAcpAgent { readonly adapterName = "codex"; private readonly rpc: AppServerRpc; private readonly proc?: CodexAppServerProcess; - private readonly model: string; - private readonly reasoningEffort?: string; + private readonly config: SessionConfigState; + private readonly onStructuredOutput?: ( + output: Record, + ) => Promise; + /** Codex-specific guidance injected at spawn time; replayed per-thread. 
*/ + private readonly developerInstructions?: string; private threadId?: string; - private pendingTurn?: { - resolve: (reason: StopReason) => void; - reject: (err: Error) => void; - }; + /** JSON schema constraining the final message; set per session via `_meta`. */ + private jsonSchema?: Record; + /** Final assistant message text for the in-flight turn (structured output). */ + private lastAgentMessage = ""; + /** True between a contextCompaction item's start and its boundary (dedupes the boundary). */ + private compactionActive = false; + /** Maps the host's taskRunId to this session, replayed for cloud notifications. */ + private taskRunId?: string; + /** Deployment environment; on "cloud" a non-danger sandbox would panic, so we skip the override. */ + private environment?: "local" | "cloud"; + private readonly mcp = new McpManager(); + private readonly turns = new TurnController(); + private readonly usage = new UsageTracker(); constructor( client: AgentSideConnection, @@ -89,8 +151,12 @@ export class CodexAppServerAgent extends BaseAcpAgent { this.logger = options.logger ?? new Logger({ debug: true, prefix: "[CodexAppServerAgent]" }); - this.model = options.model ?? DEFAULT_CODEX_MODEL; - this.reasoningEffort = options.reasoningEffort; + this.config = new SessionConfigState( + options.model ?? DEFAULT_CODEX_MODEL, + options.reasoningEffort, + ); + this.onStructuredOutput = options.onStructuredOutput; + this.developerInstructions = options.processOptions.developerInstructions; const handlers: AppServerClientHandlers = { logger: this.logger, @@ -134,83 +200,467 @@ export class CodexAppServerAgent extends BaseAcpAgent { title: "PostHog Code", version: "0.1.0", }, - capabilities: { experimentalApi: false }, + // Opt into codex's experimental API so experimental turn/start fields are honored. 
+ capabilities: { experimentalApi: true, requestAttestation: false }, }); this.rpc.notify(APP_SERVER_NOTIFICATIONS.INITIALIZED, {}); return { protocolVersion: request.protocolVersion, + agentCapabilities: { + promptCapabilities: { + image: true, + embeddedContext: true, + }, + // Only http: we don't claim SSE rather than mistranslate it into the http shape. + mcpCapabilities: { + http: true, + }, + loadSession: true, + sessionCapabilities: { + list: {}, + fork: {}, + resume: {}, + additionalDirectories: {}, + }, + _meta: { + posthog: { + resumeSession: true, + steering: "native", + }, + }, + }, agentInfo: { name: "codex", title: "Codex (app-server)", version: "0.1.0", }, + authMethods: [], }; } async newSession(params: NewSessionRequest): Promise { - const result = await this.rpc.request<{ thread?: { id?: string } }>( + const { threadId } = await this.setupThread( APP_SERVER_METHODS.THREAD_START, - { model: this.model, cwd: params.cwd }, + { + cwd: params.cwd, + mcpServers: params.mcpServers, + meta: params._meta as AppServerSessionMeta | undefined, + additionalDirectories: params.additionalDirectories ?? undefined, + }, + ); + return { sessionId: threadId, configOptions: this.config.options }; + } + + async resumeSession( + params: ResumeSessionRequest, + ): Promise { + await this.setupThread(APP_SERVER_METHODS.THREAD_RESUME, { + cwd: params.cwd, + mcpServers: params.mcpServers, + meta: params._meta as AppServerSessionMeta | undefined, + threadId: params.sessionId, + additionalDirectories: params.additionalDirectories ?? undefined, + }); + return { configOptions: this.config.options }; + } + + /** Re-attach to an existing thread without starting a turn: resume it, then replay the transcript. 
*/ + async loadSession(params: LoadSessionRequest): Promise { + const { thread } = await this.setupThread( + APP_SERVER_METHODS.THREAD_RESUME, + { + cwd: params.cwd, + mcpServers: params.mcpServers, + meta: params._meta as AppServerSessionMeta | undefined, + threadId: params.sessionId, + additionalDirectories: params.additionalDirectories ?? undefined, + }, + ); + this.replayHistory(thread); + return { configOptions: this.config.options }; + } + + async unstable_forkSession( + params: ForkSessionRequest, + ): Promise { + const { threadId } = await this.setupThread( + APP_SERVER_METHODS.THREAD_FORK, + { + cwd: params.cwd, + mcpServers: params.mcpServers, + meta: params._meta as AppServerSessionMeta | undefined, + threadId: params.sessionId, + additionalDirectories: params.additionalDirectories ?? undefined, + }, + ); + return { sessionId: threadId, configOptions: this.config.options }; + } + + /** Replay a resumed thread's persisted turns (from the thread/resume response) as session updates. */ + private replayHistory(thread: AppServerThread | undefined): void { + if (!this.sessionId || !thread?.turns?.length) return; + for (const turn of thread.turns) { + for (const item of turn.items ?? []) { + for (const update of mapHistoryItem(this.sessionId, item)) { + void this.client.sessionUpdate(update).catch(() => undefined); + } + } + } + } + + async listSessions( + params: ListSessionsRequest, + ): Promise { + try { + const res = await this.rpc.request<{ + data?: Array<{ + id?: string; + cwd?: string; + name?: string | null; + preview?: string; + }>; + }>(APP_SERVER_METHODS.THREAD_LIST, { cwd: params.cwd }); + const sessions = (res?.data ?? []) + .filter((t) => t?.id) + .map((t) => ({ + sessionId: t.id as string, + cwd: t.cwd ?? params.cwd ?? "", + ...(t.name || t.preview + ? { title: t.name ?? t.preview ?? 
undefined } + : {}), + })); + return { sessions }; + } catch (err) { + this.logger.warn("thread/list failed", { error: String(err) }); + return { sessions: [] }; + } + } + + /** Shared thread setup for start/resume/fork. `threadId` present => resume/fork; absent => new thread. */ + private async setupThread( + method: string, + params: { + cwd?: string; + mcpServers?: NewSessionRequest["mcpServers"]; + meta?: AppServerSessionMeta; + threadId?: string; + additionalDirectories?: string[]; + }, + ): Promise<{ threadId: string; thread: AppServerThread | undefined }> { + this.jsonSchema = params.meta?.jsonSchema ?? undefined; + this.taskRunId = params.meta?.taskRunId; + this.environment = params.meta?.environment; + this.config.setInitialMode(params.meta?.permissionMode); + // Codex doesn't attribute input tokens by source; the baseline seeds the resident floor + system prompt. + this.usage.setBaseline(buildBaseline(params.meta)); + // Flatten the {append} form (else "[object Object]") and dedupe identical parts + // (the host pre-flattens into developerInstructions, so the prod prompt would duplicate). + const developerInstructions = [ + ...new Set( + [ + this.developerInstructions, + flattenSystemPrompt(params.meta?.systemPrompt), + ].filter((s): s is string => !!s), + ), + ].join("\n\n"); + // Degrade gracefully: an unresolvable bundled local-tools script skips it with a + // warning rather than killing thread setup. + let localTools: ReturnType = null; + try { + localTools = buildLocalToolsServer( + { cwd: params.cwd }, + this.localToolsMeta(params.meta), + ); + } catch (err) { + this.logger.warn( + "local-tools server unavailable; continuing without it", + { error: String(err) }, + ); + } + const mcpServers = toCodexMcpServers([ + ...(params.mcpServers ?? []), + ...(localTools ? 
[localTools] : []), + ]); + const config = buildThreadConfig(mcpServers, params.additionalDirectories); + + const result = await this.rpc.request<{ thread?: AppServerThread }>( + method, + { + model: this.config.model, + cwd: params.cwd, + ...(params.threadId ? { threadId: params.threadId } : {}), + ...(developerInstructions ? { developerInstructions } : {}), + ...(config ? { config } : {}), + }, ); - const threadId = result?.thread?.id; + const thread = result?.thread; + const threadId = thread?.id ?? params.threadId; if (!threadId) { - throw new Error("codex app-server thread/start returned no thread id"); + throw new Error(`codex app-server ${method} returned no thread id`); } this.threadId = threadId; this.sessionId = threadId; - this.logger.info("Codex app-server session created", { threadId }); - return { sessionId: threadId }; + await this.loadModelConfig(); + this.emitConfigOptions(); + await this.emitAvailableCommands(); + await this.emitSdkSession(); + this.logger.info("Codex app-server thread ready", { + method, + threadId, + mcpServers: mcpServers ? 
Object.keys(mcpServers) : [], + hasOutputSchema: !!this.jsonSchema, + hasLocalTools: !!localTools, + }); + return { threadId, thread }; + } + + private localToolsMeta( + meta: AppServerSessionMeta | undefined, + ): LocalToolsMeta | undefined { + if (!meta) return undefined; + return { + environment: meta.environment, + channelMode: meta.channelMode, + taskId: meta.taskId, + persistence: meta.persistence, + baseBranch: meta.baseBranch, + }; + } + + private async emitSdkSession(): Promise { + if (!this.taskRunId || !this.sessionId) return; + await this.client + .extNotification( + POSTHOG_NOTIFICATIONS.SDK_SESSION, + buildSdkSessionParams( + this.sessionId, + this.taskRunId, + ) as unknown as Record, + ) + .catch((err) => + this.logger.warn("sdk_session extNotification failed", err), + ); + } + + async setSessionConfigOption( + params: SetSessionConfigOptionRequest, + ): Promise { + const { configId } = params as { configId?: string }; + const value = (params as { value?: unknown }).value; + const { modeChanged } = this.config.setOption(configId, value); + // collaborationMode rides the next turn/start, so a mode switch only needs current_mode_update here. + if (modeChanged) this.emitCurrentMode(this.config.mode); + this.emitConfigOptions(); + return { configOptions: this.config.options }; + } + + /** codex-acp emits current_mode_update on mode change; mirror it for the host's mode cache. */ + private emitCurrentMode(modeId: string): void { + if (!this.sessionId) return; + void this.client + .sessionUpdate({ + sessionId: this.sessionId, + update: { sessionUpdate: "current_mode_update", currentModeId: modeId }, + } as unknown as Parameters[0]) + .catch(() => undefined); + } + + private async loadModelConfig(): Promise { + try { + const res = await this.rpc.request<{ data?: any[] }>( + APP_SERVER_METHODS.MODEL_LIST, + {}, + ); + this.config.loadModels(res?.data ?? 
[]); + } catch (err) { + this.logger.warn("model/list failed; using current model only", { + error: String(err), + }); + this.config.clearModels(); + } + } + + private emitConfigOptions(): void { + if (!this.sessionId) return; + void this.client + .sessionUpdate({ + sessionId: this.sessionId, + update: { + sessionUpdate: "config_option_update", + configOptions: this.config.options, + }, + } as unknown as Parameters[0]) + .catch((err) => this.logger.warn("config_option_update failed", err)); + } + + /** skills/list → available_commands_update so the slash-command menu fills. */ + private async emitAvailableCommands(): Promise { + if (!this.sessionId) return; + let commands: Array<{ name: string; description: string }> = []; + try { + const res = await this.rpc.request<{ data?: Array<{ skills?: any[] }> }>( + APP_SERVER_METHODS.SKILLS_LIST, + {}, + ); + commands = (res?.data ?? []) + .flatMap((entry) => entry?.skills ?? []) + // Drop explicitly-disabled skills; lenient `!== false` so a malformed payload still shows. + .filter((s) => s?.name && s?.enabled !== false) + .map((s: any) => ({ name: s.name, description: s.description ?? "" })); + } catch (err) { + this.logger.warn("skills/list failed", { error: String(err) }); + } + void this.client + .sessionUpdate({ + sessionId: this.sessionId, + update: { + sessionUpdate: "available_commands_update", + availableCommands: commands, + }, + } as unknown as Parameters[0]) + .catch(() => undefined); } async prompt(params: PromptRequest): Promise { if (!this.threadId) { throw new Error("prompt() called before newSession()"); } - if (this.pendingTurn) { - // The host serializes turns; a concurrent prompt would clobber the - // single pendingTurn slot, so fail fast rather than corrupt it. - throw new Error("prompt() called while a turn is already in progress"); - } + // Reopen the notification gate (a prior interrupt may have left session.cancelled set). 
this.session.cancelled = false; - const input = toTurnInput(params.prompt); - const dropped = params.prompt.length - input.length; + // Prepend _meta.prContext (host PR-follow-up / Slack runs) to the FORWARDED prompt, + // else codex cloud follow-ups lose the PR-review context. The echo omits it. + const prContext = (params._meta as { prContext?: unknown } | undefined) + ?.prContext; + const promptBlocks = + typeof prContext === "string" && prContext.length > 0 + ? [{ type: "text" as const, text: prContext }, ...params.prompt] + : params.prompt; + const input = toCodexInput(promptBlocks); + if (input.length === 0) { + // turn/start rejects empty input, so end the turn cleanly. + this.logger.warn("prompt() had no usable input blocks; ending turn"); + return { stopReason: "end_turn" }; + } + // Count by type (not input.length): a resource block can fan out to multiple blocks. + const dropped = params.prompt.filter( + (b) => + b.type !== "text" && + b.type !== "image" && + b.type !== "resource" && + b.type !== "resource_link", + ).length; if (dropped > 0) { - this.logger.warn("Dropped non-text prompt blocks", { dropped }); + this.logger.warn("Dropped non-text/non-image prompt blocks", { dropped }); } - const completion = new Promise((resolve, reject) => { - this.pendingTurn = { resolve, reject }; - }); + // Echo the user prompt (codex emits none), for fresh turns and steering alike. + this.broadcastUserInput(params.prompt); + + if (this.turns.isRunning) { + // A turn is already running: fold the message in via turn/steer (precondition: the + // active turnId). Refresh from the response's rotated turnId so a later steer/interrupt + // still targets the live turn (no turn/started is re-emitted for a steer). 
+ const steerRes = await this.rpc + .request<{ turnId?: string }>(APP_SERVER_METHODS.TURN_STEER, { + threadId: this.threadId, + input, + expectedTurnId: this.turns.activeTurnId, + }) + .catch((err) => { + this.logger.warn("turn/steer failed", err); + return undefined; + }); + this.turns.onSteered(steerRes?.turnId); + return { stopReason: await this.turns.awaitCompletion() }; + } + if (this.turns.isPending) { + // A turn is pending but has no turnId yet, so we can't steer; fail fast. + throw new Error("prompt() called while a turn is already in progress"); + } + + this.lastAgentMessage = ""; + this.resetUsage(); + const completion = this.turns.begin(); try { + const approvalPolicy = this.config.approvalPolicy(); + const sandboxPolicy = this.config.sandboxPolicy(); + const activePermissionProfile = this.config.permissionProfile(); await this.rpc.request(APP_SERVER_METHODS.TURN_START, { threadId: this.threadId, input, - ...(this.reasoningEffort ? { effort: this.reasoningEffort } : {}), + model: this.config.model, + ...(this.config.effort ? { effort: this.config.effort } : {}), + // Always request a reasoning summary; the default "auto" can skip it on trivial turns. + summary: "detailed", + // Picker preset applied per-turn. Skipped on cloud, where a non-danger sandbox + // re-engages the unavailable linux-sandbox and panics. + ...(approvalPolicy ? { approvalPolicy } : {}), + // Pushed every turn — codex remembers the last mode, so switching back from plan must be explicit. + collaborationMode: this.config.collaborationModeForTurn(), + ...(this.environment !== "cloud" && sandboxPolicy + ? { sandboxPolicy } + : {}), + // codex 0.140.0 enforces the sandbox via named profiles; sandboxPolicy alone is no + // longer honored, so plan/read-only also send this. Same cloud gating. + ...(this.environment !== "cloud" && activePermissionProfile + ? { activePermissionProfile } + : {}), + // Constrain the final message to the task schema for parseable structured output. 
+ ...(this.jsonSchema ? { outputSchema: this.jsonSchema } : {}), }); return { stopReason: await completion }; } finally { - this.pendingTurn = undefined; + this.turns.finishPrompt(); + } + } + + /** Echo each user prompt block (text + image, so an image-only turn still renders) for the host log/UI. */ + private broadcastUserInput(prompt: PromptRequest["prompt"]): void { + if (!this.sessionId) return; + for (const block of prompt) { + if (block.type !== "text" && block.type !== "image") continue; + void this.client + .sessionUpdate({ + sessionId: this.sessionId, + update: { + sessionUpdate: "user_message_chunk", + content: block, + }, + }) + .catch(() => undefined); } } + private resetUsage(): void { + this.usage.resetForTurn(); + } + protected async interrupt(): Promise { - // Tell the server to stop first, then report the turn cancelled, so the - // caller never sees "cancelled" while Codex is still running. - if (this.threadId) { + // Stop the server, then finalize through the shared path so a cancelled turn still emits + // the cloud idle signal (finalizeTurn claims idempotently). turn/interrupt requires BOTH + // threadId and turnId (else -32600); skip the RPC when no turn started. + const turnId = this.turns.markInterrupted(); + if (this.threadId && turnId) { await this.rpc - .request(APP_SERVER_METHODS.TURN_INTERRUPT, { threadId: this.threadId }) + .request(APP_SERVER_METHODS.TURN_INTERRUPT, { + threadId: this.threadId, + turnId, + }) .catch((err) => this.logger.warn("turn/interrupt failed", err)); } - this.pendingTurn?.resolve("cancelled"); - this.pendingTurn = undefined; + await this.finalizeTurn("cancelled"); } async closeSession(): Promise { this.session.abortController.abort(); - this.pendingTurn?.resolve("cancelled"); - this.pendingTurn = undefined; + this.turns.close("cancelled"); this.session.settingsManager.dispose(); + // Close the transport BEFORE kill() destroys the stdio streams (else close() blocks on + // an ack that never arrives). 
Bounded so cleanup can't hang the caller. + await Promise.race([ + this.rpc.close().catch(() => undefined), + new Promise((resolve) => setTimeout(resolve, 2000)), + ]); this.proc?.kill(); - await this.rpc.close(); } private handleNotification(method: string, params: unknown): void { @@ -228,71 +678,421 @@ export class CodexAppServerAgent extends BaseAcpAgent { } } + if (method === APP_SERVER_NOTIFICATIONS.TURN_STARTED) { + // Capture the active turn id (steer precondition / interrupt target). + this.turns.onStarted((params as { turn?: { id?: string } })?.turn?.id); + } + + if ( + method === APP_SERVER_NOTIFICATIONS.ITEM_STARTED || + method === APP_SERVER_NOTIFICATIONS.ITEM_COMPLETED + ) { + this.mcp.capture(params); + } + + // codex auto-compaction surfaces as a contextCompaction item: item/started → in progress, + // item/completed → boundary (codex emits no separate thread/compacted; that's a guarded + // fallback). compactionActive dedupes to one boundary per compaction. + const isCompactionItem = + (params as { item?: { type?: string } })?.item?.type === + "contextCompaction"; + if ( + method === APP_SERVER_NOTIFICATIONS.ITEM_STARTED && + isCompactionItem && + !this.compactionActive + ) { + this.compactionActive = true; + this.emitCompactionStarted(); + } + if ( + this.compactionActive && + ((method === APP_SERVER_NOTIFICATIONS.ITEM_COMPLETED && + isCompactionItem) || + method === APP_SERVER_NOTIFICATIONS.CONTEXT_COMPACTED) + ) { + this.compactionActive = false; + this.emitCompactionBoundary(); + } + + if (method === APP_SERVER_NOTIFICATIONS.ITEM_COMPLETED) { + this.captureAgentMessage(params); + } + + if (method === APP_SERVER_NOTIFICATIONS.TOKEN_USAGE_UPDATED) { + this.emitUsageExtNotification(params); + } + if (method === APP_SERVER_NOTIFICATIONS.TURN_COMPLETED) { - const status = (params as { turn?: { status?: string } })?.turn?.status; - this.pendingTurn?.resolve(status === "failed" ? 
"refusal" : "end_turn"); - this.pendingTurn = undefined; + const turn = (params as { turn?: { id?: string; status?: string } }) + ?.turn; + // Drop the late completion of an already-interrupted turn (else it cancels the follow-up). + if (this.turns.shouldDropCompletion(turn?.id)) return; + void this.finalizeTurn(mapTurnStopReason(turn?.status)); + } + + if (method === APP_SERVER_NOTIFICATIONS.ERROR) { + // A non-retried fatal error: resolve the turn so prompt() returns rather than hangs. + const willRetry = (params as { willRetry?: boolean })?.willRetry; + if (willRetry === false) { + this.logger.warn("codex app-server fatal error notification", { + params, + }); + void this.finalizeTurn("refusal"); + } + } + } + + /** Track the latest assistant message so the final one feeds structured output. */ + private captureAgentMessage(params: unknown): void { + const item = (params as { item?: { type?: string; text?: string } })?.item; + if (item?.type === "agentMessage" && typeof item.text === "string") { + this.lastAgentMessage = item.text; + } + } + + /** Compaction started: emit `_posthog/status` so the host sets `isCompacting` (gates steer/queue). */ + private emitCompactionStarted(): void { + if (!this.sessionId) return; + void this.client + .extNotification(POSTHOG_NOTIFICATIONS.STATUS, { + sessionId: this.sessionId, + status: "compacting", + }) + .catch(() => undefined); + } + + /** Compaction finished: emit `_posthog/compact_boundary` (host clears isCompacting) + a transcript marker. */ + private emitCompactionBoundary(): void { + if (!this.sessionId) return; + void this.client + .extNotification(POSTHOG_NOTIFICATIONS.COMPACT_BOUNDARY, { + sessionId: this.sessionId, + }) + .catch(() => undefined); + void this.client + .sessionUpdate({ + sessionId: this.sessionId, + update: { + sessionUpdate: "agent_message_chunk", + content: { type: "text", text: "\n\nContext compacted." 
}, + }, + }) + .catch(() => undefined); + } + + /** Mirror codex-acp's `_posthog/usage_update` so the host's token/cost UI fills. */ + private emitUsageExtNotification(params: unknown): void { + if (!this.sessionId) return; + const update = this.usage.ingest(params); + if (!update) return; + void this.client + .extNotification(POSTHOG_NOTIFICATIONS.USAGE_UPDATE, { + sessionId: this.sessionId, + ...update, + }) + .catch((err) => this.logger.warn("usage extNotification failed", err)); + } + + /** Deliver structured output (parsed from the final message) before resolving the turn. */ + private async finalizeTurn(reason: StopReason): Promise { + // Idempotent: claim synchronously (before any await) so a second finalize (e.g. an + // error racing turn/completed) is a no-op and callbacks don't double-fire. + const pending = this.turns.claim(); + if (!pending) return; + // If the turn dies mid-compaction the boundary never fires, leaving isCompacting stuck + // true (silently queuing later messages). Recover here. + if (this.compactionActive) { + this.compactionActive = false; + this.emitCompactionBoundary(); + } + const message = this.lastAgentMessage; + // Per-turn usage is codex's own `tokenUsage.last` (not a reconstructed delta). + const usage = this.usage.perTurnUsage(); + const contextUsed = this.usage.contextTokens(); + + // Deliver structured output only on a clean end_turn — a cancelled/refused turn records nothing. 
+ if ( + reason === "end_turn" && + this.jsonSchema && + this.onStructuredOutput && + message + ) { + const parsed = parseStructuredOutput(message); + if (parsed) { + try { + await this.onStructuredOutput(parsed); + } catch (err) { + this.logger.warn("onStructuredOutput callback threw", { error: err }); + } + } else { + this.logger.warn( + "Could not parse structured output from final message", + { + preview: message.slice(0, 200), + }, + ); + } + } + await this.emitTurnComplete(reason, usage, contextUsed); + pending.resolve(reason); + } + + /** Emit cloud per-turn notifications: `_posthog/turn_complete` (only with a taskRunId) + the usage breakdown (always). */ + private async emitTurnComplete( + reason: StopReason, + usage: AccumulatedUsage, + contextUsed: number | undefined, + ): Promise { + if (!this.sessionId) return; + if (this.taskRunId) { + await this.client + .extNotification( + POSTHOG_NOTIFICATIONS.TURN_COMPLETE, + buildTurnCompleteParams( + this.sessionId, + reason, + usage, + ) as unknown as Record, + ) + .catch((err) => + this.logger.warn("turn_complete extNotification failed", err), + ); + } + if (contextUsed !== undefined) { + await this.client + .extNotification( + POSTHOG_NOTIFICATIONS.USAGE_UPDATE, + buildUsageBreakdownParams( + this.sessionId, + this.usage.baselineBreakdown, + contextUsed, + ) as unknown as Record, + ) + .catch((err) => + this.logger.warn("usage breakdown extNotification failed", err), + ); } } private handleServerClosed(): void { - this.pendingTurn?.reject( + this.turns.fail( new Error("codex app-server exited before the turn completed"), ); - this.pendingTurn = undefined; } + /** + * Server-initiated requests. Simple approvals resolve to a `{ decision }` envelope (a bare + * string is rejected); richer ones (AskUserQuestion / permission profile / elicitation) go + * to `handleServerRequest`. Whatever we return is sent back as the JSON-RPC result. 
+ */ private async handleApproval( method: string, params: unknown, - ): Promise { + ): Promise { + const richer = await handleServerRequest(method, params, this.client, { + sessionId: this.sessionId, + logger: this.logger, + resolveMcpToolCall: (serverName) => this.mcp.byServer(serverName), + }); + if (richer.handled) { + return richer.response; + } if ( method !== APP_SERVER_REQUESTS.COMMAND_APPROVAL && method !== APP_SERVER_REQUESTS.FILE_CHANGE_APPROVAL ) { this.logger.warn("Unrecognized server request; declining", { method }); - return "decline"; + return { decision: "decline" }; } - const detail = params as { itemId?: string; command?: string }; + const isFileChange = method === APP_SERVER_REQUESTS.FILE_CHANGE_APPROVAL; + const detail = params as { + itemId?: string; + command?: string; + changes?: AppServerItem["changes"]; + available_decisions?: unknown; + }; + // codex tells us which decisions are valid here. When it offers an "approve and + // remember" decision (exec-policy allowlist / session approval), surface Allow-always. + const availableDecisions = Array.isArray(detail.available_decisions) + ? detail.available_decisions.filter( + (d): d is string => typeof d === "string", + ) + : []; + const rememberDecision = + availableDecisions.find((d) => d === "approved_execpolicy_amendment") ?? + availableDecisions.find((d) => d === "approved_for_session"); const title = - detail.command ?? - (method === APP_SERVER_REQUESTS.FILE_CHANGE_APPROVAL - ? "Apply file changes" - : "Run command"); + detail.command ?? (isFileChange ? "Apply file changes" : "Run command"); + const toolCallId = detail.itemId ?? "codex-approval"; + // Codex has no MCP-specific approval; a known MCP call surfaces the real server/tool/args + // so the host renders the proper MCP permission (incl. PostHog `exec` unwrapping). + const mcp = this.mcp.byItemId(detail.itemId); + // kind + content route plain command/file approvals to Execute/EditPermission (not the fallback). 
+ const toolCall = mcp + ? { + toolCallId, + title, + kind: "other" as const, + rawInput: mcp.args, + _meta: posthogToolMeta({ + toolName: mcpToolKey({ server: mcp.server, tool: mcp.tool }), + mcp: { server: mcp.server, tool: mcp.tool }, + }), + } + : isFileChange + ? { + toolCallId, + title, + kind: "edit" as const, + content: diffContent(detail.changes), + locations: changePaths(detail.changes).map((path) => ({ path })), + } + : { + toolCallId, + title, + kind: "execute" as const, + content: detail.command + ? [ + { + type: "content" as const, + content: { type: "text" as const, text: detail.command }, + }, + ] + : undefined, + }; try { const response = await this.client.requestPermission({ sessionId: this.sessionId, - toolCall: { toolCallId: detail.itemId ?? "codex-approval", title }, + toolCall, options: [ { optionId: "allow", name: "Allow", kind: "allow_once" }, + ...(rememberDecision + ? [ + { + optionId: "allow_always", + name: isFileChange + ? "Allow for the rest of this session" + : "Allow and don't ask again", + kind: "allow_always" as const, + }, + ] + : []), { optionId: "reject", name: "Reject", kind: "reject_once" }, + { + optionId: "reject_with_feedback", + name: "No, and tell Codex what to do differently", + kind: "reject_once", + _meta: { customInput: true }, + }, ], }); - if ( - response.outcome.outcome === "selected" && - response.outcome.optionId === "allow" - ) { - return "accept"; + if (response.outcome.outcome === "selected") { + if (response.outcome.optionId === "allow_always" && rememberDecision) { + // Echo codex's "approve and remember" decision so it applies the proposed amendment. + return { decision: rememberDecision }; + } + if (response.outcome.optionId === "allow") { + return { decision: "accept" }; + } + if (response.outcome.optionId === "reject_with_feedback") { + // codex's response has no feedback field, so decline and inject the guidance + // into the running turn (as its TUI does: Denied + a follow-up message). 
+ const feedback = (response as { _meta?: { customInput?: unknown } }) + ._meta?.customInput; + const activeTurnId = this.turns.activeTurnId; + if (typeof feedback === "string" && feedback.trim() && activeTurnId) { + void this.rpc + .request<{ turnId?: string }>(APP_SERVER_METHODS.TURN_STEER, { + threadId: this.threadId, + input: toCodexInput([{ type: "text", text: feedback.trim() }]), + expectedTurnId: activeTurnId, + }) + // codex rotates the turn id on steer; adopt it or later + // interrupts/steers target a dead turn. + .then((res) => this.turns.onSteered(res?.turnId)) + .catch((err) => + this.logger.warn("turn/steer (reject feedback) failed", err), + ); + } + return { decision: "decline" }; + } } if (response.outcome.outcome === "cancelled") { - return "cancel"; + return { decision: "cancel" }; } - return "decline"; + return { decision: "decline" }; } catch (err) { this.logger.warn("requestPermission failed; declining", err); - return "decline"; + return { decision: "decline" }; } } } -function toTurnInput( - prompt: ContentBlock[], -): Array<{ type: "text"; text: string }> { - const input: Array<{ type: "text"; text: string }> = []; - for (const block of prompt) { - if (block.type === "text") { - input.push({ type: "text", text: block.text }); +// BASELINE_TOKENS from codex-rs protocol.rs — the resident floor we can't attribute per-source. +const CODEX_BASELINE_TOKENS = 12000; + +/** codex `TurnStatus` → ACP `StopReason`: interrupted → cancel, failed → refusal, else end. */ +function mapTurnStopReason(status: string | undefined): StopReason { + if (status === "interrupted") return "cancelled"; + if (status === "failed") return "refusal"; + return "end_turn"; +} + +/** The codex thread config override map: folds in MCP servers + makes extra workspace roots writable. Undefined when empty. 
*/ +function buildThreadConfig( + mcpServers: ReturnType, + additionalDirectories: string[] | undefined, +): Record | undefined { + const config: Record = {}; + if (mcpServers) { + config.mcp_servers = mcpServers; + } + if (additionalDirectories?.length) { + config.sandbox_workspace_write = { writable_roots: additionalDirectories }; + } + return Object.keys(config).length > 0 ? config : undefined; +} + +/** Seed the context-breakdown baseline with the resident floor + the host's system prompt. */ +function buildBaseline( + meta: AppServerSessionMeta | undefined, +): ContextBreakdownBaseline { + const baseline = emptyBaseline(); + baseline.systemPrompt = + CODEX_BASELINE_TOKENS + + estimateTokens(flattenSystemPrompt(meta?.systemPrompt)); + return baseline; +} + +/** Flatten the host's systemPrompt (`string | { append }`) to a string (else "[object Object]"). */ +function flattenSystemPrompt( + systemPrompt: string | { append?: string } | undefined, +): string | undefined { + if (typeof systemPrompt === "string") return systemPrompt || undefined; + if (systemPrompt && typeof systemPrompt.append === "string") { + return systemPrompt.append || undefined; + } + return undefined; +} + +/** Parse structured output from the final message, defensively (fenced block / first object). */ +function parseStructuredOutput(text: string): Record | null { + const trimmed = text.trim(); + const candidates = [trimmed]; + const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/); + if (fenced) candidates.push(fenced[1].trim()); + const brace = trimmed.match(/\{[\s\S]*\}/); + if (brace) candidates.push(brace[0]); + + for (const candidate of candidates) { + try { + const parsed: unknown = JSON.parse(candidate); + if (parsed && typeof parsed === "object" && !Array.isArray(parsed)) { + return parsed as Record; + } + } catch { + // Try the next candidate. 
} } - return input; + return null; } diff --git a/packages/agent/src/adapters/codex-app-server/ext-notifications.test.ts b/packages/agent/src/adapters/codex-app-server/ext-notifications.test.ts new file mode 100644 index 0000000000..93538ee006 --- /dev/null +++ b/packages/agent/src/adapters/codex-app-server/ext-notifications.test.ts @@ -0,0 +1,74 @@ +import { describe, expect, it } from "vitest"; +import { emptyBaseline } from "../claude/context-breakdown"; +import { + buildSdkSessionParams, + buildTurnCompleteParams, + buildUsageBreakdownParams, +} from "./ext-notifications"; + +describe("ext-notifications builders", () => { + it("buildSdkSessionParams tags the codex adapter so resume keys on the family", () => { + expect(buildSdkSessionParams("sess-1", "run-42")).toEqual({ + taskRunId: "run-42", + sessionId: "sess-1", + adapter: "codex", + }); + }); + + it("buildTurnCompleteParams derives totalTokens from all four counts", () => { + const params = buildTurnCompleteParams("sess-1", "end_turn", { + inputTokens: 100, + outputTokens: 20, + cachedReadTokens: 5, + cachedWriteTokens: 3, + }); + + expect(params).toEqual({ + sessionId: "sess-1", + stopReason: "end_turn", + usage: { + inputTokens: 100, + outputTokens: 20, + cachedReadTokens: 5, + cachedWriteTokens: 3, + totalTokens: 128, + }, + }); + }); + + it("buildTurnCompleteParams forwards non-default stop reasons", () => { + expect( + buildTurnCompleteParams("sess-1", "refusal", { + inputTokens: 0, + outputTokens: 0, + cachedReadTokens: 0, + cachedWriteTokens: 0, + }).stopReason, + ).toBe("refusal"); + }); + + it("buildUsageBreakdownParams attributes overflow above the baseline to conversation", () => { + const baseline = { ...emptyBaseline(), systemPrompt: 1000, tools: 500 }; + + expect(buildUsageBreakdownParams("sess-1", baseline, 2000)).toEqual({ + sessionId: "sess-1", + breakdown: { + systemPrompt: 1000, + tools: 500, + rules: 0, + skills: 0, + mcp: 0, + subagents: 0, + conversation: 500, + }, + }); + }); + + 
it("buildUsageBreakdownParams floors conversation at 0 when usage is below baseline", () => { + const baseline = { ...emptyBaseline(), systemPrompt: 1000 }; + + expect( + buildUsageBreakdownParams("sess-1", baseline, 200).breakdown.conversation, + ).toBe(0); + }); +}); diff --git a/packages/agent/src/adapters/codex-app-server/ext-notifications.ts b/packages/agent/src/adapters/codex-app-server/ext-notifications.ts new file mode 100644 index 0000000000..f6898c8fa2 --- /dev/null +++ b/packages/agent/src/adapters/codex-app-server/ext-notifications.ts @@ -0,0 +1,106 @@ +/** + * Pure builders for the PostHog `_posthog/*` ext-notification params the app-server + * adapter emits, mirroring the codex-acp adapter so log consumers and the renderer + * see the same shapes. Param-only (no I/O) so each is unit-testable in isolation. + */ + +import type { StopReason } from "@agentclientprotocol/sdk"; +import { + buildBreakdown, + type ContextBreakdown, + type ContextBreakdownBaseline, +} from "../claude/context-breakdown"; + +/** + * Adapter tag on `_posthog/sdk_session`. Kept `"codex"` (not `"codex-app-server"`) + * so resume/keying treats both Codex transports as the same agent family. + */ +const CODEX_ADAPTER = "codex" as const; + +export interface SdkSessionParams { + taskRunId: string; + sessionId: string; + adapter: typeof CODEX_ADAPTER; +} + +/** `_posthog/sdk_session` — maps a taskRunId to the sessionId so the host can resume later. */ +export function buildSdkSessionParams( + sessionId: string, + taskRunId: string, +): SdkSessionParams { + return { + taskRunId, + sessionId, + adapter: CODEX_ADAPTER, + }; +} + +/** Per-turn token usage. `totalTokens` is derived so consumers don't re-sum. 
*/ +export interface TurnCompleteUsage { + inputTokens: number; + outputTokens: number; + cachedReadTokens: number; + cachedWriteTokens: number; + totalTokens: number; +} + +export interface TurnCompleteParams { + sessionId: string; + stopReason: StopReason; + usage: TurnCompleteUsage; +} + +/** The four component counts the caller accumulates; total is computed here. */ +export interface AccumulatedUsage { + inputTokens: number; + outputTokens: number; + cachedReadTokens: number; + cachedWriteTokens: number; +} + +/** + * `_posthog/turn_complete` — fired when a prompt turn finishes. `totalTokens` is the + * sum of all four component counts, matching the codex-acp adapter. + */ +export function buildTurnCompleteParams( + sessionId: string, + stopReason: StopReason, + usage: AccumulatedUsage, +): TurnCompleteParams { + return { + sessionId, + stopReason, + usage: { + inputTokens: usage.inputTokens, + outputTokens: usage.outputTokens, + cachedReadTokens: usage.cachedReadTokens, + cachedWriteTokens: usage.cachedWriteTokens, + totalTokens: + usage.inputTokens + + usage.outputTokens + + usage.cachedReadTokens + + usage.cachedWriteTokens, + }, + }; +} + +export interface UsageBreakdownParams { + sessionId: string; + breakdown: ContextBreakdown; +} + +/** + * `_posthog/usage_update` (breakdown variant) — per-source context attribution. + * Codex doesn't attribute tokens by source, so we fold the baseline estimate with + * the live `contextUsed` via `buildBreakdown`. 
+ */ +export function buildUsageBreakdownParams( + sessionId: string, + baseline: ContextBreakdownBaseline, + contextUsed: number, +): UsageBreakdownParams { + return { + sessionId, + breakdown: buildBreakdown(baseline, contextUsed), + }; +} diff --git a/packages/agent/src/adapters/codex-app-server/input.test.ts b/packages/agent/src/adapters/codex-app-server/input.test.ts new file mode 100644 index 0000000000..f63e6172ef --- /dev/null +++ b/packages/agent/src/adapters/codex-app-server/input.test.ts @@ -0,0 +1,129 @@ +import type { ContentBlock } from "@agentclientprotocol/sdk"; +import { describe, expect, it } from "vitest"; +import { toCodexInput } from "./input"; + +describe("toCodexInput", () => { + it("passes text blocks through with empty text_elements", () => { + const prompt: ContentBlock[] = [ + { type: "text", text: "hello" }, + { type: "text", text: "world" }, + ]; + + expect(toCodexInput(prompt)).toEqual([ + { type: "text", text: "hello", text_elements: [] }, + { type: "text", text: "world", text_elements: [] }, + ]); + }); + + it("maps a base64 image block to the codex image variant as a data URL", () => { + const prompt: ContentBlock[] = [ + { type: "image", data: "AAAA", mimeType: "image/png" }, + ]; + + expect(toCodexInput(prompt)).toEqual([ + { type: "image", url: "data:image/png;base64,AAAA" }, + ]); + }); + + it("maps an http(s) image URI to a remote image and file:// to localImage", () => { + const prompt: ContentBlock[] = [ + { + type: "image", + data: "", + mimeType: "image/png", + uri: "https://x/y.png", + }, + { + type: "image", + data: "", + mimeType: "image/png", + uri: "file:///tmp/pic.png", + }, + ]; + + expect(toCodexInput(prompt)).toEqual([ + { type: "image", url: "https://x/y.png" }, + { type: "localImage", path: "/tmp/pic.png" }, + ]); + }); + + it("drops only audio and unusable images, keeping text", () => { + const prompt: ContentBlock[] = [ + { type: "text", text: "keep" }, + { type: "audio", data: "AAAA", mimeType: "audio/wav" }, 
+ { type: "image", data: "", mimeType: "image/png", uri: "ftp://nope" }, + ]; + + expect(toCodexInput(prompt)).toEqual([ + { type: "text", text: "keep", text_elements: [] }, + ]); + }); + + it("surfaces a file:// resource_link as its on-disk path", () => { + const prompt: ContentBlock[] = [ + { type: "resource_link", uri: "file:///repo/doc.md", name: "doc" }, + ]; + + expect(toCodexInput(prompt)).toEqual([ + { + type: "text", + text: "Attached workspace file (read it from disk): /repo/doc.md", + text_elements: [], + }, + ]); + }); + + it("inlines a non-file resource's text as a trailing block", () => { + const prompt: ContentBlock[] = [ + { type: "text", text: "use the snippet" }, + { + type: "resource", + resource: { uri: "https://x/snippet", text: "const a = 1;" }, + }, + ]; + + expect(toCodexInput(prompt)).toEqual([ + { type: "text", text: "use the snippet", text_elements: [] }, + { type: "text", text: "https://x/snippet", text_elements: [] }, + { + type: "text", + text: '\nconst a = 1;\n', + text_elements: [], + }, + ]); + }); + + it("omits the bare-uri text block for a resource with no uri", () => { + const prompt: ContentBlock[] = [ + { + type: "resource", + resource: { text: "inline snippet" }, + } as unknown as ContentBlock, + ]; + + expect(toCodexInput(prompt)).toEqual([ + { + type: "text", + text: '\ninline snippet\n', + text_elements: [], + }, + ]); + }); + + it("surfaces a file:// resource as its path, not inline text", () => { + const prompt: ContentBlock[] = [ + { + type: "resource", + resource: { uri: "file:///repo/a.ts", text: "stale on-disk copy" }, + }, + ]; + + expect(toCodexInput(prompt)).toEqual([ + { + type: "text", + text: "Attached workspace file (read it from disk): /repo/a.ts", + text_elements: [], + }, + ]); + }); +}); diff --git a/packages/agent/src/adapters/codex-app-server/input.ts b/packages/agent/src/adapters/codex-app-server/input.ts new file mode 100644 index 0000000000..3992cbbbad --- /dev/null +++ 
b/packages/agent/src/adapters/codex-app-server/input.ts @@ -0,0 +1,103 @@ +import { fileURLToPath } from "node:url"; +import type { ContentBlock } from "@agentclientprotocol/sdk"; + +/** + * Codex app-server `UserInput`, narrowed to the three variants an ACP prompt + * can produce (`text`, remote `image`, `localImage`). + */ +export type CodexUserInput = + | { type: "text"; text: string; text_elements: [] } + | { type: "image"; url: string } + | { type: "localImage"; path: string }; + +function textInput(text: string): CodexUserInput { + return { type: "text", text, text_elements: [] }; +} + +/** A `file://` resource is surfaced as its path so codex reads it from disk. */ +function resourceLinkText(uri: string): string { + if (uri.startsWith("file://")) { + try { + return `Attached workspace file (read it from disk): ${fileURLToPath(uri)}`; + } catch { + return `Attached file: ${uri}`; + } + } + return `Attached resource: ${uri}`; +} + +/** + * Maps ACP prompt content blocks to codex app-server `UserInput[]`. Text passes through; + * images map to `image`/`localImage`; `file://` resources become path notes and non-file + * resource text is inlined as a trailing `` block. Audio/blob/malformed are dropped. + */ +export function toCodexInput(prompt: ContentBlock[]): CodexUserInput[] { + const input: CodexUserInput[] = []; + const context: string[] = []; + for (const block of prompt) { + if (block.type === "text") { + input.push(textInput(block.text)); + continue; + } + if (block.type === "image") { + const mapped = imageToCodexInput(block); + if (mapped) { + input.push(mapped); + } + continue; + } + if (block.type === "resource_link") { + input.push(textInput(resourceLinkText(block.uri))); + continue; + } + if (block.type === "resource" && "text" in block.resource) { + const uri = block.resource.uri ?? 
""; + if (uri.startsWith("file://")) { + input.push(textInput(resourceLinkText(uri))); + continue; + } + if (uri) { + input.push(textInput(uri)); + } + context.push( + `\n${block.resource.text}\n`, + ); + } + } + if (context.length > 0) { + input.push(textInput(context.join("\n"))); + } + return input; +} + +/** + * Prefer inline base64 (as a data URL); else fall back to the `uri`: + * `http(s)` → remote `image`, `file://` → `localImage`. + */ +function imageToCodexInput(block: { + data: string; + mimeType: string; + uri?: string | null; +}): CodexUserInput | undefined { + if (block.data) { + return { + type: "image", + url: `data:${block.mimeType};base64,${block.data}`, + }; + } + const uri = block.uri; + if (!uri) { + return undefined; + } + if (uri.startsWith("http://") || uri.startsWith("https://")) { + return { type: "image", url: uri }; + } + if (uri.startsWith("file://")) { + try { + return { type: "localImage", path: fileURLToPath(uri) }; + } catch { + return undefined; + } + } + return undefined; +} diff --git a/packages/agent/src/adapters/codex-app-server/local-tools-mcp.test.ts b/packages/agent/src/adapters/codex-app-server/local-tools-mcp.test.ts new file mode 100644 index 0000000000..1a65e20dbd --- /dev/null +++ b/packages/agent/src/adapters/codex-app-server/local-tools-mcp.test.ts @@ -0,0 +1,100 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; +import { LOCAL_TOOLS_MCP_NAME } from "../local-tools"; +import { buildLocalToolsServer } from "./local-tools-mcp"; + +// The dist asset isn't on the walk-up path in unit tests, so make existsSync +// succeed; nothing spawns the script — we only inspect the path. 
+vi.mock("node:fs", async (importActual) => { + const actual = await importActual(); + return { ...actual, existsSync: vi.fn().mockReturnValue(true) }; +}); + +describe("buildLocalToolsServer", () => { + const saved = { + sandbox: process.env.IS_SANDBOX, + ghToken: process.env.GH_TOKEN, + githubToken: process.env.GITHUB_TOKEN, + }; + + beforeEach(() => { + // The signed-git gate reads IS_SANDBOX and the token vars; clear them so each + // case controls the cloud signal (meta.environment) and token explicitly. + delete process.env.IS_SANDBOX; + delete process.env.GH_TOKEN; + delete process.env.GITHUB_TOKEN; + }); + + afterEach(() => { + restore("IS_SANDBOX", saved.sandbox); + restore("GH_TOKEN", saved.ghToken); + restore("GITHUB_TOKEN", saved.githubToken); + }); + + function restore(key: string, value: string | undefined): void { + if (value === undefined) { + delete process.env[key]; + } else { + process.env[key] = value; + } + } + + it("returns a stdio server config with command/args/env on a cloud run with a token", () => { + process.env.GH_TOKEN = "ghs_test"; + + const server = buildLocalToolsServer( + { cwd: "/repo" }, + { environment: "cloud" }, + ); + + expect(server).not.toBeNull(); + expect(server?.name).toBe(LOCAL_TOOLS_MCP_NAME); + expect(server?.command).toBe(process.execPath); + expect(server?.args).toHaveLength(1); + expect(server?.args[0]).toMatch(/local-tools-mcp-server\.js$/); + + const envNames = server?.env.map((e) => e.name) ?? []; + expect(envNames).toContain("POSTHOG_LOCAL_TOOLS_CTX"); + expect(envNames).toContain("POSTHOG_LOCAL_TOOLS_ENABLED"); + // Token is forwarded to the child so its own git remote ops authenticate. + expect(envNames).toContain("GH_TOKEN"); + expect(envNames).toContain("GITHUB_TOKEN"); + + const ctxEntry = server?.env.find( + (e) => e.name === "POSTHOG_LOCAL_TOOLS_CTX", + ); + const ctx = JSON.parse( + Buffer.from(ctxEntry?.value ?? 
"", "base64").toString("utf-8"), + ); + expect(ctx.cwd).toBe("/repo"); + expect(ctx.token).toBe("ghs_test"); + }); + + it("returns a server but omits token env vars when no token is present", () => { + const server = buildLocalToolsServer( + { cwd: "/repo" }, + { environment: "cloud" }, + ); + + expect(server).not.toBeNull(); + const envNames = server?.env.map((e) => e.name) ?? []; + expect(envNames).toContain("POSTHOG_LOCAL_TOOLS_CTX"); + expect(envNames).not.toContain("GH_TOKEN"); + expect(envNames).not.toContain("GITHUB_TOKEN"); + }); + + it("returns null when no cwd is present", () => { + process.env.GH_TOKEN = "ghs_test"; + + expect( + buildLocalToolsServer({ cwd: undefined }, { environment: "cloud" }), + ).toBeNull(); + }); + + it("returns null when no tool's gate passes (desktop run)", () => { + process.env.GH_TOKEN = "ghs_test"; + + expect( + buildLocalToolsServer({ cwd: "/repo" }, { environment: "local" }), + ).toBeNull(); + }); +}); diff --git a/packages/agent/src/adapters/codex-app-server/local-tools-mcp.ts b/packages/agent/src/adapters/codex-app-server/local-tools-mcp.ts new file mode 100644 index 0000000000..e7f0976f59 --- /dev/null +++ b/packages/agent/src/adapters/codex-app-server/local-tools-mcp.ts @@ -0,0 +1,104 @@ +/** + * Builds the stdio local-tools MCP server config to inject into a Codex + * app-server thread's `config.mcp_servers`, ported from the codex-acp adapter. + * Returns the ACP `McpServerStdio` shape so the existing translation layer stays + * the single owner of the ACP→Codex map. 
+ */ + +import { existsSync } from "node:fs"; +import { resolve as resolvePath } from "node:path"; +import type { McpServerStdio } from "@agentclientprotocol/sdk"; +import { ghTokenEnv } from "@posthog/git/signed-commit"; +import { resolveGithubToken } from "../../utils/github-token"; +import { + enabledLocalTools, + LOCAL_TOOLS_MCP_NAME, + type LocalToolCtx, + type LocalToolGateMeta, +} from "../local-tools"; +import { resolveTaskId } from "../session-meta"; + +/** + * Gate inputs the local-tools server needs beyond `LocalToolGateMeta`: the task id + * and the base branch the signed-git tools default to. Self-contained so this + * module doesn't depend on the hub agent's session-meta type. + */ +export interface LocalToolsMeta extends LocalToolGateMeta { + taskId?: string; + persistence?: { taskId?: string }; + baseBranch?: string; +} + +/** + * Resolve a shared dist asset by walking up from the compiled adapter location — + * its depth varies across bundle entry points. Mirrors the codex-acp adapter. + */ +function resolveBundledMcpScript(rel: string): string { + let dir = import.meta.dirname ?? __dirname; + for (let i = 0; i < 5; i++) { + const candidate = resolvePath(dir, rel); + if (existsSync(candidate)) return candidate; + dir = resolvePath(dir, ".."); + } + throw new Error( + `Could not locate ${rel} relative to ${import.meta.dirname ?? __dirname}.`, + ); +} + +function toMcpServerStdio( + ctx: LocalToolCtx, + enabledNames: string[], +): McpServerStdio { + const scriptPath = resolveBundledMcpScript( + "adapters/codex/local-tools-mcp-server.js", + ); + const ctxBase64 = Buffer.from(JSON.stringify(ctx)).toString("base64"); + const env = [ + { name: "POSTHOG_LOCAL_TOOLS_CTX", value: ctxBase64 }, + { name: "POSTHOG_LOCAL_TOOLS_ENABLED", value: enabledNames.join(",") }, + ]; + if (ctx.token) { + // Token also on the child env so its own git remote ops authenticate. 
+ env.push( + ...Object.entries(ghTokenEnv(ctx.token)).map(([name, value]) => ({ + name, + value, + })), + ); + } + return { + name: LOCAL_TOOLS_MCP_NAME, + command: process.execPath, + args: [scriptPath], + env, + }; +} + +/** + * Returns the local-tools stdio server config to inject, or null when no tool's + * gate passes (e.g. local/desktop run with no GH token). Tools self-gate via the + * registry; the server is only injected when at least one passes. + */ +export function buildLocalToolsServer( + ctx: { cwd?: string }, + meta: LocalToolsMeta | undefined, +): McpServerStdio | null { + const cwd = ctx.cwd; + if (!cwd) { + return null; + } + const toolCtx: LocalToolCtx = { + cwd, + token: resolveGithubToken(), + taskId: resolveTaskId(meta), + baseBranch: meta?.baseBranch, + }; + const tools = enabledLocalTools(toolCtx, meta); + if (tools.length === 0) { + return null; + } + return toMcpServerStdio( + toolCtx, + tools.map((t) => t.name), + ); +} diff --git a/packages/agent/src/adapters/codex-app-server/mapping.test.ts b/packages/agent/src/adapters/codex-app-server/mapping.test.ts index fd4f1882d0..18454844b3 100644 --- a/packages/agent/src/adapters/codex-app-server/mapping.test.ts +++ b/packages/agent/src/adapters/codex-app-server/mapping.test.ts @@ -1,5 +1,9 @@ import { describe, expect, it } from "vitest"; -import { mapAppServerNotification } from "./mapping"; +import { + mapAppServerNotification, + mapHistoryItem, + parseUnifiedDiff, +} from "./mapping"; import { APP_SERVER_NOTIFICATIONS } from "./protocol"; describe("mapAppServerNotification", () => { @@ -7,7 +11,7 @@ describe("mapAppServerNotification", () => { const result = mapAppServerNotification( "s-1", APP_SERVER_NOTIFICATIONS.AGENT_MESSAGE_DELTA, - { itemId: "item_1", text: "Hello" }, + { itemId: "item_1", delta: "Hello" }, ); expect(result).toEqual({ @@ -19,7 +23,26 @@ describe("mapAppServerNotification", () => { }); }); - it("returns null when the text is missing or empty", () => { + it.each([ + 
["raw textDelta", APP_SERVER_NOTIFICATIONS.REASONING_TEXT_DELTA], + ["summaryTextDelta", APP_SERVER_NOTIFICATIONS.REASONING_SUMMARY_TEXT_DELTA], + ])("maps a reasoning %s to an ACP agent_thought_chunk", (_label, method) => { + const result = mapAppServerNotification("s-1", method, { + itemId: "item_1", + delta: "thinking", + contentIndex: 0, + }); + + expect(result).toEqual({ + sessionId: "s-1", + update: { + sessionUpdate: "agent_thought_chunk", + content: { type: "text", text: "thinking" }, + }, + }); + }); + + it("returns null when the delta is missing or empty", () => { expect( mapAppServerNotification( "s-1", @@ -31,16 +54,586 @@ describe("mapAppServerNotification", () => { mapAppServerNotification( "s-1", APP_SERVER_NOTIFICATIONS.AGENT_MESSAGE_DELTA, - { itemId: "item_1", text: "" }, + { itemId: "item_1", delta: "" }, + ), + ).toBeNull(); + }); + + it("maps a started command execution item to a tool_call", () => { + const result = mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.ITEM_STARTED, + { item: { type: "commandExecution", id: "i1", command: "ls -la" } }, + ); + + expect(result).toEqual({ + sessionId: "s-1", + update: { + sessionUpdate: "tool_call", + toolCallId: "i1", + title: "ls -la", + kind: "execute", + status: "in_progress", + }, + }); + }); + + it("maps a completed command execution item to a tool_call_update with output", () => { + const result = mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.ITEM_COMPLETED, + { + item: { + type: "commandExecution", + id: "i1", + command: "ls", + status: "completed", + aggregatedOutput: "file.txt", + }, + }, + ); + + expect(result).toEqual({ + sessionId: "s-1", + update: { + sessionUpdate: "tool_call_update", + toolCallId: "i1", + status: "completed", + content: [ + { type: "content", content: { type: "text", text: "file.txt" } }, + ], + }, + }); + }); + + it("maps a started mcp tool call item, surfacing arguments as rawInput", () => { + const result = mapAppServerNotification( + 
"s-1", + APP_SERVER_NOTIFICATIONS.ITEM_STARTED, + { + item: { + type: "mcpToolCall", + id: "m1", + server: "posthog", + tool: "execute-sql", + arguments: { query: "SELECT 1" }, + }, + }, + ); + + expect(result).toEqual({ + sessionId: "s-1", + update: { + sessionUpdate: "tool_call", + toolCallId: "m1", + title: "posthog/execute-sql", + kind: "other", + status: "in_progress", + rawInput: { query: "SELECT 1" }, + _meta: { + posthog: { + toolName: "mcp__posthog__execute-sql", + mcp: { server: "posthog", tool: "execute-sql" }, + }, + }, + }, + }); + }); + + it("tags an mcp exec tool call with the structured posthog channel the renderer routes on", () => { + const result = mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.ITEM_STARTED, + { + item: { + type: "mcpToolCall", + id: "m2", + server: "posthog", + tool: "exec", + arguments: { command: "call execute-sql {}" }, + }, + }, + ); + + const meta = (result?.update as { _meta?: unknown })._meta as { + posthog?: { toolName?: string; mcp?: { server: string; tool: string } }; + }; + expect(meta.posthog).toEqual({ + toolName: "mcp__posthog__exec", + mcp: { server: "posthog", tool: "exec" }, + }); + }); + + it("drops agent message items (their deltas already streamed)", () => { + expect( + mapAppServerNotification("s-1", APP_SERVER_NOTIFICATIONS.ITEM_COMPLETED, { + item: { type: "agentMessage", id: "a1", text: "done" }, + }), + ).toBeNull(); + }); + + it("maps thread/tokenUsage/updated to a usage_update from the per-turn `last` (not cumulative `total`)", () => { + const result = mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.TOKEN_USAGE_UPDATED, + { + threadId: "t", + turnId: "u", + tokenUsage: { + total: { totalTokens: 1500, inputTokens: 1000, outputTokens: 500 }, + last: { + totalTokens: 600, + inputTokens: 500, + outputTokens: 100, + cachedInputTokens: 0, + reasoningOutputTokens: 0, + }, + modelContextWindow: 200000, + }, + }, + ); + expect(result).toEqual({ + sessionId: "s-1", + update: { 
sessionUpdate: "usage_update", used: 600, size: 200000 }, + }); + }); + + it("falls back to cumulative `total` when `last` is absent (pre-`last` build / turn 1)", () => { + const result = mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.TOKEN_USAGE_UPDATED, + { + threadId: "t", + turnId: "u", + tokenUsage: { + total: { totalTokens: 1500, inputTokens: 1000, outputTokens: 500 }, + modelContextWindow: 200000, + }, + }, + ); + expect(result).toEqual({ + sessionId: "s-1", + update: { sessionUpdate: "usage_update", used: 1500, size: 200000 }, + }); + }); + + it("maps turn/plan/updated to a plan update", () => { + const result = mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.TURN_PLAN_UPDATED, + { + threadId: "t", + turnId: "u", + plan: [ + { step: "Read files", status: "completed" }, + { step: "Edit", status: "inProgress" }, + ], + }, + ); + expect(result).toEqual({ + sessionId: "s-1", + update: { + sessionUpdate: "plan", + entries: [ + { content: "Read files", priority: "medium", status: "completed" }, + { content: "Edit", priority: "medium", status: "in_progress" }, + ], + }, + }); + }); + + it("maps a completed fileChange to a tool_call_update with diff content", () => { + const result = mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.ITEM_COMPLETED, + { + item: { + type: "fileChange", + id: "f1", + status: "completed", + changes: [{ path: "a.txt", diff: "@@ -1 +1 @@\n-old\n+new" }], + }, + }, + ); + expect(result).toEqual({ + sessionId: "s-1", + update: { + sessionUpdate: "tool_call_update", + toolCallId: "f1", + status: "completed", + content: [ + { type: "diff", path: "a.txt", oldText: "old", newText: "new" }, + ], + }, + }); + }); + + it("includes cwd as a follow-along location on a started command execution", () => { + const result = mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.ITEM_STARTED, + { + item: { + type: "commandExecution", + id: "c1", + command: "pytest", + cwd: "/repo", + }, + }, + ); + 
expect(result).toEqual({ + sessionId: "s-1", + update: { + sessionUpdate: "tool_call", + toolCallId: "c1", + title: "pytest", + kind: "execute", + status: "in_progress", + locations: [{ path: "/repo" }], + }, + }); + }); + + it("prefers command-action paths over cwd for read commands", () => { + const result = mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.ITEM_STARTED, + { + item: { + type: "commandExecution", + id: "c2", + command: "cat foo.txt", + cwd: "/repo", + commandActions: [ + { type: "read", path: "/repo/foo.txt" }, + { type: "read", path: "/repo/foo.txt" }, + ], + }, + }, + ); + expect(result).toEqual({ + sessionId: "s-1", + update: { + sessionUpdate: "tool_call", + toolCallId: "c2", + title: "cat foo.txt", + kind: "read", + status: "in_progress", + locations: [{ path: "/repo/foo.txt" }], + }, + }); + }); + + it("titles a started fileChange with its path and exposes locations", () => { + const result = mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.ITEM_STARTED, + { + item: { + type: "fileChange", + id: "f2", + changes: [{ path: "src/a.ts" }, { path: "src/b.ts" }], + }, + }, + ); + expect(result).toEqual({ + sessionId: "s-1", + update: { + sessionUpdate: "tool_call", + toolCallId: "f2", + title: "src/a.ts (+1 more)", + kind: "edit", + status: "in_progress", + locations: [{ path: "src/a.ts" }, { path: "src/b.ts" }], + }, + }); + }); + + it("streams command output deltas as in-progress tool_call_update text", () => { + const result = mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.COMMAND_OUTPUT_DELTA, + { threadId: "t", turnId: "u", itemId: "c1", delta: "line 1\n" }, + ); + expect(result).toEqual({ + sessionId: "s-1", + update: { + sessionUpdate: "tool_call_update", + toolCallId: "c1", + status: "in_progress", + content: [ + { type: "content", content: { type: "text", text: "line 1\n" } }, + ], + }, + }); + }); + + it("echoes terminal interaction stdin into the tool call output", () => { + const result = 
mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.TERMINAL_INTERACTION, + { + threadId: "t", + turnId: "u", + itemId: "c1", + processId: "p1", + stdin: "y\n", + }, + ); + expect(result).toEqual({ + sessionId: "s-1", + update: { + sessionUpdate: "tool_call_update", + toolCallId: "c1", + status: "in_progress", + content: [{ type: "content", content: { type: "text", text: "y\n" } }], + }, + }); + }); + + it("returns null for an output delta missing itemId or delta", () => { + expect( + mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.COMMAND_OUTPUT_DELTA, + { itemId: "c1", delta: "" }, ), ).toBeNull(); + expect( + mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.COMMAND_OUTPUT_DELTA, + { delta: "x" }, + ), + ).toBeNull(); + }); + + it("streams fileChange patch updates as in-progress diff content", () => { + const result = mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.FILE_CHANGE_PATCH_UPDATED, + { + threadId: "t", + turnId: "u", + itemId: "f1", + changes: [ + { + path: "a.txt", + kind: { type: "update" }, + diff: "@@ -1 +1 @@\n-x\n+y", + }, + ], + }, + ); + expect(result).toEqual({ + sessionId: "s-1", + update: { + sessionUpdate: "tool_call_update", + toolCallId: "f1", + status: "in_progress", + content: [{ type: "diff", path: "a.txt", oldText: "x", newText: "y" }], + }, + }); }); - it("returns null for notifications not yet mapped in the spike", () => { + it("returns null for the turn completion notification", () => { expect( mapAppServerNotification("s-1", APP_SERVER_NOTIFICATIONS.TURN_COMPLETED, { - usage: { input_tokens: 10 }, + turn: { status: "completed" }, }), ).toBeNull(); }); }); + +describe("mapHistoryItem", () => { + it("replays a userMessage's text inputs as user_message_chunks", () => { + expect( + mapHistoryItem("s-1", { + type: "userMessage", + id: "u1", + content: [ + { type: "text", text: "hello", text_elements: [] }, + { type: "image", url: "data:image/png;base64,AAAA" }, + { type: "text", text: 
"world", text_elements: [] }, + ], + }), + ).toEqual([ + { + sessionId: "s-1", + update: { + sessionUpdate: "user_message_chunk", + content: { type: "text", text: "hello" }, + }, + }, + { + sessionId: "s-1", + update: { + sessionUpdate: "user_message_chunk", + content: { type: "text", text: "world" }, + }, + }, + ]); + }); + + it("replays an agentMessage as an agent_message_chunk", () => { + expect( + mapHistoryItem("s-1", { type: "agentMessage", id: "a1", text: "done" }), + ).toEqual([ + { + sessionId: "s-1", + update: { + sessionUpdate: "agent_message_chunk", + content: { type: "text", text: "done" }, + }, + }, + ]); + }); + + it("replays a completed command as one tool_call carrying status + output", () => { + expect( + mapHistoryItem("s-1", { + type: "commandExecution", + id: "c1", + command: "ls -la", + status: "completed", + commandActions: [{ type: "read", path: "/repo/a.ts" }], + aggregatedOutput: "a.ts\n", + }), + ).toEqual([ + { + sessionId: "s-1", + update: { + sessionUpdate: "tool_call", + toolCallId: "c1", + title: "ls -la", + kind: "read", + status: "completed", + locations: [{ path: "/repo/a.ts" }], + content: [ + { type: "content", content: { type: "text", text: "a.ts\n" } }, + ], + }, + }, + ]); + }); + + it("replays a fileChange as a tool_call with diff content", () => { + const [update] = mapHistoryItem("s-1", { + type: "fileChange", + id: "f1", + status: "completed", + changes: [{ path: "a.txt", diff: "-x\n+y", kind: "modify" }], + }); + expect(update.update).toMatchObject({ + sessionUpdate: "tool_call", + toolCallId: "f1", + kind: "edit", + status: "completed", + content: [{ type: "diff", path: "a.txt", oldText: "x", newText: "y" }], + }); + }); + + it("does not replay ephemeral reasoning/plan items", () => { + expect(mapHistoryItem("s-1", { type: "reasoning", id: "r1" })).toEqual([]); + expect( + mapHistoryItem("s-1", { type: "plan", id: "p1", text: "the plan" }), + ).toEqual([]); + }); +}); + +describe("parseUnifiedDiff", () => { + it("keeps 
added/removed content lines whose payload starts with ++ or --", () => { + expect(parseUnifiedDiff("@@ -1 +1 @@\n---count;\n+++count;")).toEqual({ + oldText: "--count;", + newText: "++count;", + }); + }); + + it("skips file headers and the no-newline marker", () => { + expect( + parseUnifiedDiff( + "--- a/x.ts\n+++ b/x.ts\n@@ -1 +1 @@\n-old\n+new\n\\ No newline at end of file", + ), + ).toEqual({ oldText: "old", newText: "new" }); + }); +}); + +describe("mcpToolCall result rendering", () => { + it("renders a completed mcpToolCall's result content as text", () => { + expect( + mapAppServerNotification("s-1", APP_SERVER_NOTIFICATIONS.ITEM_COMPLETED, { + item: { + type: "mcpToolCall", + id: "m1", + server: "posthog", + tool: "query", + status: "completed", + arguments: { sql: "SELECT 1" }, + result: { content: [{ type: "text", text: "42 rows" }] }, + }, + }), + ).toEqual({ + sessionId: "s-1", + update: { + sessionUpdate: "tool_call_update", + toolCallId: "m1", + status: "completed", + content: [ + { type: "content", content: { type: "text", text: "42 rows" } }, + ], + }, + }); + }); + + it("renders a failed mcpToolCall's error message", () => { + const result = mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.ITEM_COMPLETED, + { + item: { + type: "mcpToolCall", + id: "m2", + server: "x", + tool: "y", + status: "failed", + error: { message: "boom" }, + }, + }, + ); + expect(result?.update).toMatchObject({ + sessionUpdate: "tool_call_update", + toolCallId: "m2", + status: "failed", + content: [{ type: "content", content: { type: "text", text: "boom" } }], + }); + }); + + it("renders a dynamicToolCall (not dropped) with its inputText output", () => { + const result = mapAppServerNotification( + "s-1", + APP_SERVER_NOTIFICATIONS.ITEM_COMPLETED, + { + item: { + type: "dynamicToolCall", + id: "d1", + namespace: "ns", + tool: "doit", + status: "completed", + arguments: { x: 1 }, + contentItems: [{ type: "inputText", text: "result" }], + }, + }, + ); + 
expect(result).toEqual({ + sessionId: "s-1", + update: { + sessionUpdate: "tool_call_update", + toolCallId: "d1", + status: "completed", + content: [ + { type: "content", content: { type: "text", text: "result" } }, + ], + }, + }); + }); +}); diff --git a/packages/agent/src/adapters/codex-app-server/mapping.ts b/packages/agent/src/adapters/codex-app-server/mapping.ts index d282981e14..46c6148b86 100644 --- a/packages/agent/src/adapters/codex-app-server/mapping.ts +++ b/packages/agent/src/adapters/codex-app-server/mapping.ts @@ -1,14 +1,15 @@ -import type { SessionNotification } from "@agentclientprotocol/sdk"; +import type { + SessionNotification, + ToolCallContent, + ToolCallLocation, +} from "@agentclientprotocol/sdk"; +import { mcpToolKey, posthogToolMeta } from "@posthog/shared"; import { APP_SERVER_NOTIFICATIONS } from "./protocol"; /** - * Translates a native app-server notification into an ACP SessionNotification - * so the rest of PostHog Code, which speaks ACP, stays unchanged. - * - * Spike scope: only the streaming agent-message path is mapped, which is what - * Phase A proves end to end. item/tool events, token usage and approvals are - * mapped in Phase B once the generated schema pins their exact shapes. - * Notifications without a mapping return null and are dropped. + * Translates a native app-server notification into an ACP SessionNotification. + * Streamed text maps to chunks; tool-like items map to `tool_call`/`tool_call_update`. + * Agent-message and reasoning items are dropped — their deltas already streamed. */ export function mapAppServerNotification( sessionId: string, @@ -17,22 +18,501 @@ export function mapAppServerNotification( ): SessionNotification | null { switch (method) { case APP_SERVER_NOTIFICATIONS.AGENT_MESSAGE_DELTA: { - // `item/agentMessage/delta` carries { itemId, text }. 
- const text = readStringField(params, "text"); - if (!text) return null; + const delta = readStringField(params, "delta"); + if (!delta) return null; return { sessionId, update: { sessionUpdate: "agent_message_chunk", - content: { type: "text", text }, + content: { type: "text", text: delta }, }, }; } + case APP_SERVER_NOTIFICATIONS.REASONING_TEXT_DELTA: + case APP_SERVER_NOTIFICATIONS.REASONING_SUMMARY_TEXT_DELTA: { + const delta = readStringField(params, "delta"); + if (!delta) return null; + return { + sessionId, + update: { + sessionUpdate: "agent_thought_chunk", + content: { type: "text", text: delta }, + }, + }; + } + case APP_SERVER_NOTIFICATIONS.TOKEN_USAGE_UPDATED: { + // Context indicator: renderer reads `used`/`size`; detailed breakdown comes via `_posthog/usage_update`. + const tu = (params as { tokenUsage?: any })?.tokenUsage; + // Use this turn's `last`, not cumulative `total` (which over-reports and pegs the + // gauge); `total` is the fallback for pre-`last` builds. + const context = tu?.last ?? tu?.total; + const used = context?.totalTokens ?? context?.inputTokens; + if (used == null) return null; + const size = tu?.modelContextWindow; + // `usage_update` is a PostHog-convention update, not in the ACP union. + return { + sessionId, + update: { + sessionUpdate: "usage_update", + used, + ...(size != null ? { size } : {}), + }, + } as unknown as SessionNotification; + } + case APP_SERVER_NOTIFICATIONS.TURN_PLAN_UPDATED: { + const plan = ( + params as { plan?: Array<{ step?: string; status?: string }> } + )?.plan; + if (!Array.isArray(plan)) return null; + return { + sessionId, + update: { + sessionUpdate: "plan", + entries: plan.map((s) => ({ + content: s.step ?? 
"", + priority: "medium", + status: mapPlanStatus(s.status), + })), + }, + } as unknown as SessionNotification; + } + case APP_SERVER_NOTIFICATIONS.ITEM_STARTED: + case APP_SERVER_NOTIFICATIONS.ITEM_COMPLETED: { + const item = readItem(params); + if (!item) return null; + return mapItem( + sessionId, + item, + method === APP_SERVER_NOTIFICATIONS.ITEM_COMPLETED, + ); + } + case APP_SERVER_NOTIFICATIONS.COMMAND_OUTPUT_DELTA: { + const itemId = readStringField(params, "itemId"); + const delta = readStringField(params, "delta"); + if (!itemId || !delta) return null; + return toolOutputChunk(sessionId, itemId, delta); + } + case APP_SERVER_NOTIFICATIONS.TERMINAL_INTERACTION: { + const itemId = readStringField(params, "itemId"); + const stdin = readStringField(params, "stdin"); + if (!itemId || !stdin) return null; + return toolOutputChunk(sessionId, itemId, stdin); + } + case APP_SERVER_NOTIFICATIONS.FILE_CHANGE_PATCH_UPDATED: { + const itemId = readStringField(params, "itemId"); + if (!itemId) return null; + const changes = (params as { changes?: AppServerItem["changes"] }) + ?.changes; + const content = diffContent(changes); + if (!content) return null; + return { + sessionId, + update: { + sessionUpdate: "tool_call_update", + toolCallId: itemId, + status: "in_progress", + content, + }, + }; + } + default: + return null; + } +} + +/** A streamed text chunk on an in-progress tool call; the renderer appends successive single-chunk updates. 
*/ +function toolOutputChunk( + sessionId: string, + toolCallId: string, + text: string, +): SessionNotification { + return { + sessionId, + update: { + sessionUpdate: "tool_call_update", + toolCallId, + status: "in_progress", + content: [{ type: "content", content: { type: "text", text } }], + }, + }; +} + +function mapPlanStatus( + status: string | undefined, +): "pending" | "in_progress" | "completed" { + if (status === "inProgress") return "in_progress"; + if (status === "completed") return "completed"; + return "pending"; +} + +/** + * Extracts {oldText,newText} from a unified diff so a codex `fileChange` renders as an ACP diff. + * Cosmetic limit: a content line whose payload begins with "-- "/"++ " is misread as a header and dropped. + */ +export function parseUnifiedDiff(diff: string): { + oldText: string; + newText: string; +} { + const oldLines: string[] = []; + const newLines: string[] = []; + for (const line of diff.split("\n")) { + // Skip diff/hunk metadata; match trailing space on ---/+++ so content lines like "++i;" aren't dropped. + if ( + line.startsWith("@@") || + line.startsWith("diff ") || + line.startsWith("index ") || + line.startsWith("--- ") || + line.startsWith("+++ ") || + line.startsWith("\\ ") + ) { + continue; + } + if (line.startsWith("-")) oldLines.push(line.slice(1)); + else if (line.startsWith("+")) newLines.push(line.slice(1)); + else { + const ctx = line.startsWith(" ") ? 
line.slice(1) : line; + oldLines.push(ctx); + newLines.push(ctx); + } + } + return { oldText: oldLines.join("\n"), newText: newLines.join("\n") }; +} + +export type AppServerItem = { + type?: string; + id?: string; + command?: string; + cwd?: string; + commandActions?: Array<{ type?: string; path?: string } | string>; + server?: string; + tool?: string; + namespace?: string | null; + contentItems?: unknown; + query?: string; + status?: string; + arguments?: unknown; + aggregatedOutput?: string | null; + changes?: Array<{ path?: string; diff?: string; kind?: unknown }>; + result?: { content?: unknown } | null; + error?: { message?: string } | null; + // Present on message/reasoning items replayed from thread history. + text?: string; + content?: unknown; +}; + +function mcpResultText( + result: AppServerItem["result"], + error: AppServerItem["error"], +): string | null { + if (error?.message) return error.message; + const content = result?.content; + if (!Array.isArray(content)) return null; + const text = content + .filter( + (c) => + c && typeof c === "object" && (c as { type?: string }).type === "text", + ) + .map((c) => (c as { text?: string }).text ?? "") + .filter(Boolean) + .join("\n"); + return text || null; +} + +function dynamicToolText(items: unknown): string | null { + if (!Array.isArray(items)) return null; + const text = items + .filter( + (c) => + c && + typeof c === "object" && + (c as { type?: string }).type === "inputText", + ) + .map((c) => (c as { text?: string }).text ?? "") + .filter(Boolean) + .join("\n"); + return text || null; +} + +/** + * Re-renders a persisted `ThreadItem` as the ACP updates a live stream would have produced, + * so a reattaching host shows the full transcript. Tool items collapse to one completed + * `tool_call`; ephemeral items (reasoning, plan) are not replayed. 
+ */ +export function mapHistoryItem( + sessionId: string, + item: AppServerItem, +): SessionNotification[] { + switch (item.type) { + case "userMessage": + return userMessageChunks(sessionId, item.content); + case "agentMessage": + return item.text + ? [ + { + sessionId, + update: { + sessionUpdate: "agent_message_chunk", + content: { type: "text", text: item.text }, + }, + }, + ] + : []; + case "reasoning": + case "plan": + return []; + default: { + const tool = describeTool(item); + if (!tool || !item.id) return []; + const content = completedContent(item, tool); + return [ + { + sessionId, + update: { + sessionUpdate: "tool_call", + toolCallId: item.id, + title: tool.title, + kind: tool.kind, + status: mapStatus(item.status), + ...(tool.rawInput !== undefined ? { rawInput: tool.rawInput } : {}), + ...(tool.locations?.length ? { locations: tool.locations } : {}), + ...(tool.mcp + ? { + _meta: posthogToolMeta({ + toolName: mcpToolKey(tool.mcp), + mcp: tool.mcp, + }), + } + : {}), + ...(content ? { content } : {}), + }, + }, + ]; + } + } +} + +/** Replays a persisted `userMessage`'s text inputs; historical image attachments aren't re-rendered. 
*/ +function userMessageChunks( + sessionId: string, + content: unknown, +): SessionNotification[] { + if (!Array.isArray(content)) return []; + const out: SessionNotification[] = []; + for (const block of content) { + if ( + block && + typeof block === "object" && + (block as { type?: string }).type === "text" + ) { + const text = (block as { text?: string }).text; + if (typeof text === "string" && text) { + out.push({ + sessionId, + update: { + sessionUpdate: "user_message_chunk", + content: { type: "text", text }, + }, + }); + } + } + } + return out; +} + +type ToolDescriptor = { + title: string; + kind: "execute" | "edit" | "fetch" | "other" | "read" | "search"; + rawInput?: unknown; + output?: string | null; + locations?: ToolCallLocation[]; + /** Originating MCP server + tool, surfaced on `_meta.posthog` so the renderer routes MCP rendering. */ + mcp?: { server: string; tool: string }; +}; + +/** Classify a shell command by its actions so read-only commands render as read/search, not execute. */ +function commandKind( + actions: AppServerItem["commandActions"], +): "read" | "search" | "execute" { + if (!actions?.length) return "execute"; + const types = actions.map((a) => (typeof a === "string" ? a : a?.type)); + if (types.every((t) => t === "read")) return "read"; + if (types.every((t) => t === "search" || t === "listFiles")) return "search"; + return "execute"; +} + +function describeTool(item: AppServerItem): ToolDescriptor | null { + switch (item.type) { + case "commandExecution": + return { + title: item.command ?? "Run command", + kind: commandKind(item.commandActions), + output: item.aggregatedOutput ?? null, + locations: commandLocations(item), + }; + case "fileChange": { + const paths = changePaths(item.changes); + return { + title: fileChangeTitle(paths), + kind: "edit", + locations: paths.map((path) => ({ path })), + }; + } + case "mcpToolCall": + return { + title: `${item.server ?? "mcp"}/${item.tool ?? 
"tool"}`, + kind: "other", + rawInput: item.arguments, + output: mcpResultText(item.result, item.error), + mcp: { server: item.server ?? "mcp", tool: item.tool ?? "tool" }, + }; + case "dynamicToolCall": + return { + title: item.namespace + ? `${item.namespace}/${item.tool ?? "tool"}` + : (item.tool ?? "tool"), + kind: "other", + rawInput: item.arguments, + output: dynamicToolText(item.contentItems), + }; + case "webSearch": + return { title: item.query ?? "Web search", kind: "fetch" }; default: return null; } } +/** Distinct, non-empty changed paths for a fileChange item, order-preserved. */ +export function changePaths(changes: AppServerItem["changes"]): string[] { + if (!changes?.length) return []; + const seen = new Set(); + const paths: string[] = []; + for (const change of changes) { + const path = change?.path; + if (path && !seen.has(path)) { + seen.add(path); + paths.push(path); + } + } + return paths; +} + +function fileChangeTitle(paths: string[]): string { + if (!paths.length) return "Edit files"; + if (paths.length === 1) return paths[0]; + return `${paths[0]} (+${paths.length - 1} more)`; +} + +/** Clickable locations for a commandExecution: action paths, else the cwd as a fallback. */ +function commandLocations(item: AppServerItem): ToolCallLocation[] | undefined { + const paths: string[] = []; + const seen = new Set(); + for (const action of item.commandActions ?? []) { + const path = typeof action === "string" ? 
undefined : action?.path; + if (path && !seen.has(path)) { + seen.add(path); + paths.push(path); + } + } + if (!paths.length && item.cwd) paths.push(item.cwd); + if (!paths.length) return undefined; + return paths.map((path) => ({ path })); +} + +function mapItem( + sessionId: string, + item: AppServerItem, + completed: boolean, +): SessionNotification | null { + const tool = describeTool(item); + if (!tool || !item.id) { + return null; + } + + if (!completed) { + return { + sessionId, + update: { + sessionUpdate: "tool_call", + toolCallId: item.id, + title: tool.title, + kind: tool.kind, + status: "in_progress", + ...(tool.rawInput !== undefined ? { rawInput: tool.rawInput } : {}), + ...(tool.locations?.length ? { locations: tool.locations } : {}), + ...(tool.mcp + ? { + _meta: posthogToolMeta({ + toolName: mcpToolKey(tool.mcp), + mcp: tool.mcp, + }), + } + : {}), + }, + }; + } + + const content = completedContent(item, tool); + return { + sessionId, + update: { + sessionUpdate: "tool_call_update", + toolCallId: item.id, + status: mapStatus(item.status), + ...(content ? { content } : {}), + }, + }; +} + +function completedContent( + item: AppServerItem, + tool: ToolDescriptor, +): ToolCallContent[] | undefined { + if (item.type === "fileChange") { + const diffs = diffContent(item.changes); + if (diffs) return diffs; + } + if (tool.output) { + return [{ type: "content", content: { type: "text", text: tool.output } }]; + } + return undefined; +} + +/** Maps a fileChange's `changes[]` to ACP `diff` content blocks. */ +export function diffContent( + changes: AppServerItem["changes"], +): ToolCallContent[] | undefined { + if (!changes?.length) return undefined; + const diffs = changes + .filter((c) => c?.diff) + .map( + (c) => + ({ + type: "diff", + path: c.path, + ...parseUnifiedDiff(c.diff ?? ""), + }) as unknown as ToolCallContent, + ); + return diffs.length ? 
diffs : undefined; +} + +function mapStatus( + status: string | undefined, +): "completed" | "failed" | "in_progress" { + if (status === "completed") return "completed"; + if (status === "failed" || status === "declined") return "failed"; + return "in_progress"; +} + +function readItem(params: unknown): AppServerItem | null { + if (params && typeof params === "object" && "item" in params) { + const item = (params as Record).item; + if (item && typeof item === "object") { + return item as AppServerItem; + } + } + return null; +} + function readStringField(params: unknown, key: string): string | null { if (params && typeof params === "object" && key in params) { const value = (params as Record)[key]; diff --git a/packages/agent/src/adapters/codex-app-server/mcp-config.test.ts b/packages/agent/src/adapters/codex-app-server/mcp-config.test.ts new file mode 100644 index 0000000000..912d253b38 --- /dev/null +++ b/packages/agent/src/adapters/codex-app-server/mcp-config.test.ts @@ -0,0 +1,60 @@ +import type { McpServer } from "@agentclientprotocol/sdk"; +import { describe, expect, it } from "vitest"; +import { toCodexMcpServers } from "./mcp-config"; + +describe("toCodexMcpServers", () => { + it("returns undefined for empty input", () => { + expect(toCodexMcpServers(undefined)).toBeUndefined(); + expect(toCodexMcpServers([])).toBeUndefined(); + }); + + it("translates a stdio server, folding env pairs into a map", () => { + const servers = [ + { + name: "posthog", + command: "node", + args: ["server.js"], + env: [ + { name: "TOKEN", value: "abc" }, + { name: "BASE", value: "http://x" }, + ], + }, + ] as unknown as McpServer[]; + + expect(toCodexMcpServers(servers)).toEqual({ + posthog: { + command: "node", + args: ["server.js"], + env: { TOKEN: "abc", BASE: "http://x" }, + }, + }); + }); + + it("omits env when there are no pairs", () => { + const servers = [ + { name: "bare", command: "run", args: [], env: [] }, + ] as unknown as McpServer[]; + + 
expect(toCodexMcpServers(servers)).toEqual({ + bare: { command: "run", args: [] }, + }); + }); + + it("translates an http server, folding headers into http_headers", () => { + const servers = [ + { + type: "http", + name: "remote", + url: "https://mcp.example/mcp", + headers: [{ name: "Authorization", value: "Bearer t" }], + }, + ] as unknown as McpServer[]; + + expect(toCodexMcpServers(servers)).toEqual({ + remote: { + url: "https://mcp.example/mcp", + http_headers: { Authorization: "Bearer t" }, + }, + }); + }); +}); diff --git a/packages/agent/src/adapters/codex-app-server/mcp-config.ts b/packages/agent/src/adapters/codex-app-server/mcp-config.ts new file mode 100644 index 0000000000..4e4873e516 --- /dev/null +++ b/packages/agent/src/adapters/codex-app-server/mcp-config.ts @@ -0,0 +1,55 @@ +import type { McpServer } from "@agentclientprotocol/sdk"; + +/** + * Codex's per-thread `mcp_servers` config entry (stdio: command/args/env; http: + * url + headers), accepted under `thread/start`'s `config.mcp_servers`. + */ +export type CodexMcpServerConfig = + | { command: string; args: string[]; env?: Record } + | { url: string; http_headers?: Record }; + +/** + * Translates the ACP `McpServer[]` into the shape Codex's app-server expects under + * `config.mcp_servers` — ACP encodes env/headers as `{ name, value }[]`, Codex + * wants plain string maps. Returns undefined when there's nothing to inject. + */ +export function toCodexMcpServers( + servers: McpServer[] | undefined, +): Record | undefined { + if (!servers || servers.length === 0) { + return undefined; + } + + const out: Record = {}; + for (const server of servers) { + if ("command" in server && server.command) { + const env = pairsToRecord(server.env); + out[server.name] = { + command: server.command, + args: server.args ?? [], + ...(env ? { env } : {}), + }; + } else if ("url" in server && server.url) { + const headers = pairsToRecord(server.headers); + out[server.name] = { + url: server.url, + ...(headers ? 
{ http_headers: headers } : {}), + }; + } + } + + return Object.keys(out).length > 0 ? out : undefined; +} + +function pairsToRecord( + pairs: Array<{ name: string; value: string }> | undefined, +): Record | undefined { + if (!pairs || pairs.length === 0) { + return undefined; + } + const record: Record = {}; + for (const { name, value } of pairs) { + record[name] = value; + } + return record; +} diff --git a/packages/agent/src/adapters/codex-app-server/mcp-manager.ts b/packages/agent/src/adapters/codex-app-server/mcp-manager.ts new file mode 100644 index 0000000000..6faae9f47d --- /dev/null +++ b/packages/agent/src/adapters/codex-app-server/mcp-manager.ts @@ -0,0 +1,50 @@ +/** An MCP tool call codex is running: its server, tool, and arguments. */ +export interface McpCall { + server: string; + tool: string; + args: unknown; +} + +/** + * Correlates codex approval prompts back to the MCP tool that triggered them: by + * item id for a command approval, or by server name for an elicitation (which + * carries no id, so it maps to the latest in-flight call — MCP calls are sequential). + */ +export class McpManager { + private readonly byId = new Map(); + private latest?: McpCall; + + /** Record an `mcpToolCall` item from an item/started or item/completed notification. */ + capture(params: unknown): void { + const item = ( + params as { + item?: { + type?: string; + id?: string; + server?: string; + tool?: string; + arguments?: unknown; + }; + } + )?.item; + if (item?.type === "mcpToolCall" && item.id && item.server && item.tool) { + const call: McpCall = { + server: item.server, + tool: item.tool, + args: item.arguments, + }; + this.byId.set(item.id, call); + this.latest = call; + } + } + + /** The MCP call for a command-execution approval's item id, if known. */ + byItemId(itemId: string | undefined): McpCall | undefined { + return itemId ? 
this.byId.get(itemId) : undefined; + } + + /** The in-flight MCP call for an elicitation's server (elicitations carry no item id). */ + byServer(serverName: string): McpCall | undefined { + return this.latest?.server === serverName ? this.latest : undefined; + } +} diff --git a/packages/agent/src/adapters/codex-app-server/protocol.ts b/packages/agent/src/adapters/codex-app-server/protocol.ts index 0448513366..bd948e9a34 100644 --- a/packages/agent/src/adapters/codex-app-server/protocol.ts +++ b/packages/agent/src/adapters/codex-app-server/protocol.ts @@ -1,14 +1,7 @@ /** - * Minimal typings for the native Codex `app-server` JSON-RPC protocol. - * - * Method names and message shapes follow the documented protocol - * (https://developers.openai.com/codex/app-server). The wire framing is - * newline-delimited JSON that follows JSON-RPC 2.0 structure but omits the - * `"jsonrpc": "2.0"` header on the wire. - * - * Spike scope: param/result shapes are still partial. Generate the exact, - * version-pinned schema with `codex app-server generate-ts` once the codex - * binary is bundled, then tighten these. + * Minimal typings for the native Codex `app-server` JSON-RPC protocol + * (https://developers.openai.com/codex/app-server). Wire framing is + * newline-delimited JSON that omits the `"jsonrpc": "2.0"` header. */ export const APP_SERVER_METHODS = { @@ -17,27 +10,56 @@ export const APP_SERVER_METHODS = { THREAD_RESUME: "thread/resume", THREAD_FORK: "thread/fork", TURN_START: "turn/start", + // Inject input into the active turn (mirrors Claude's mid-turn steering); fails unless `expectedTurnId` matches. + TURN_STEER: "turn/steer", TURN_INTERRUPT: "turn/interrupt", + MODEL_LIST: "model/list", + SKILLS_LIST: "skills/list", + THREAD_LIST: "thread/list", } as const; export const APP_SERVER_NOTIFICATIONS = { INITIALIZED: "initialized", THREAD_STARTED: "thread/started", + // Carries the active turn id — precondition for turn/steer + turn/interrupt. 
+ TURN_STARTED: "turn/started", ITEM_STARTED: "item/started", ITEM_COMPLETED: "item/completed", AGENT_MESSAGE_DELTA: "item/agentMessage/delta", + REASONING_TEXT_DELTA: "item/reasoning/textDelta", + // Default reasoning stream for gpt-5 models; raw textDelta is off by default, so without this the host sees no reasoning. + REASONING_SUMMARY_TEXT_DELTA: "item/reasoning/summaryTextDelta", + TURN_PLAN_UPDATED: "turn/plan/updated", TURN_COMPLETED: "turn/completed", + // Fatal turn error; `willRetry:false` means it won't recover on its own. + ERROR: "error", TOKEN_USAGE_UPDATED: "thread/tokenUsage/updated", + // codex auto-compacted the thread; mirrors Claude's compact_boundary so the host's context indicator + queue drain fire. + CONTEXT_COMPACTED: "thread/compacted", + COMMAND_OUTPUT_DELTA: "item/commandExecution/outputDelta", + // PTY-level stdin echoed back for an interactive terminal command. + TERMINAL_INTERACTION: "item/commandExecution/terminalInteraction", + FILE_CHANGE_PATCH_UPDATED: "item/fileChange/patchUpdated", } as const; -/** Server-initiated requests the client must answer (approvals). */ +/** + * Server-initiated requests the client must answer. The two approvals are yes/no + * decisions; the richer requests carry distinct response shapes (multi-question + * prompt, permission-profile grant, MCP elicitation). + */ export const APP_SERVER_REQUESTS = { COMMAND_APPROVAL: "item/commandExecution/requestApproval", FILE_CHANGE_APPROVAL: "item/fileChange/requestApproval", + TOOL_USER_INPUT: "item/tool/requestUserInput", + PERMISSIONS_APPROVAL: "item/permissions/requestApproval", + MCP_ELICITATION: "mcpServer/elicitation/request", } as const; +/** JSON-RPC ids are `string | number` per the codex schema (`RequestId.ts`). 
*/ +export type RequestId = string | number; + export interface JsonRpcRequest { - id: number; + id: RequestId; method: string; params?: unknown; } @@ -54,7 +76,7 @@ export interface JsonRpcError { } export interface JsonRpcResponse { - id: number; + id: RequestId; result?: unknown; error?: JsonRpcError; } diff --git a/packages/agent/src/adapters/codex-app-server/session-config.test.ts b/packages/agent/src/adapters/codex-app-server/session-config.test.ts new file mode 100644 index 0000000000..08fbb428ab --- /dev/null +++ b/packages/agent/src/adapters/codex-app-server/session-config.test.ts @@ -0,0 +1,166 @@ +import { describe, expect, it } from "vitest"; +import { + buildConfigOptions, + CODEX_MODES, + collaborationModeFor, + DEFAULT_EFFORTS, + modeApprovalPolicy, + sandboxPolicyFor, +} from "./session-config"; + +describe("modeApprovalPolicy", () => { + it.each([ + ["read-only", "untrusted"], + ["auto", "on-request"], + ["full-access", "never"], + ])("maps mode %s to approval policy %s", (mode, policy) => { + expect(modeApprovalPolicy(mode)).toBe(policy); + }); + + it("returns undefined for an unknown mode", () => { + expect(modeApprovalPolicy("nonsense")).toBeUndefined(); + expect(modeApprovalPolicy(undefined)).toBeUndefined(); + }); + + it("every CODEX_MODES entry has a resolvable policy", () => { + for (const mode of CODEX_MODES) { + expect(modeApprovalPolicy(mode.id)).toBe(mode.approvalPolicy); + } + }); +}); + +describe("sandboxPolicyFor", () => { + it("restricts plan + read-only to a read-only sandbox", () => { + expect(sandboxPolicyFor("plan")).toEqual({ + type: "readOnly", + networkAccess: true, + }); + expect(sandboxPolicyFor("read-only")).toEqual({ + type: "readOnly", + networkAccess: true, + }); + }); + + it("leaves auto + full-access at the spawned full-access sandbox (no override)", () => { + expect(sandboxPolicyFor("auto")).toBeUndefined(); + expect(sandboxPolicyFor("full-access")).toBeUndefined(); + }); + + it("returns undefined for unknown ids", () 
=> { + expect(sandboxPolicyFor("bypassPermissions")).toBeUndefined(); + expect(sandboxPolicyFor(undefined)).toBeUndefined(); + }); +}); + +describe("collaborationModeFor", () => { + it("maps only Plan to codex's plan collaboration; everything else is default", () => { + expect(collaborationModeFor("plan")).toBe("plan"); + expect(collaborationModeFor("read-only")).toBe("default"); + expect(collaborationModeFor("auto")).toBe("default"); + expect(collaborationModeFor("full-access")).toBe("default"); + expect(collaborationModeFor(undefined)).toBe("default"); + }); +}); + +describe("buildConfigOptions", () => { + const byCategory = ( + opts: ReturnType<typeof buildConfigOptions>, + category: string, + ) => + opts.find((o) => (o as { category: string }).category === category) as { + currentValue: string; + options: Array<{ value: string; name: string }>; + }; + + it("emits mode + model + thought_level selectors from the live lists", () => { + const opts = buildConfigOptions({ + mode: "auto", + model: "gpt-5.5", + effort: "high", + models: [ + { id: "gpt-5.5", name: "GPT-5.5" }, + { id: "gpt-5-mini", name: "GPT-5 mini" }, + ], + efforts: ["low", "high"], + }); + expect(opts.map((o) => (o as { category: string }).category)).toEqual([ + "mode", + "model", + "thought_level", + ]); + const model = byCategory(opts, "model"); + expect(model.currentValue).toBe("gpt-5.5"); + expect(model.options.map((o) => o.value)).toEqual([ + "gpt-5.5", + "gpt-5-mini", + ]); + }); + + it("surfaces the flattened codex presets (incl. 
Plan) with the current mode selected", () => { + const mode = byCategory( + buildConfigOptions({ + mode: "plan", + model: "gpt-5.5", + models: [], + efforts: [], + }), + "mode", + ); + expect(mode.currentValue).toBe("plan"); + expect(mode.options.map((o) => o.value)).toEqual([ + "plan", + "read-only", + "auto", + "full-access", + ]); + }); + + it("keeps the active model/effort selectable even if the lists omit them", () => { + const opts = buildConfigOptions({ + mode: "auto", + model: "gpt-5.5", + effort: "max", + models: [{ id: "gpt-5-mini", name: "GPT-5 mini" }], + efforts: ["low", "high"], + }); + const model = byCategory(opts, "model"); + const effort = byCategory(opts, "thought_level"); + expect(model.currentValue).toBe("gpt-5.5"); + expect(model.options.map((o) => o.value)).toContain("gpt-5.5"); + expect(effort.currentValue).toBe("max"); + expect(effort.options.map((o) => o.value)).toContain("max"); + }); + + it("humanizes reasoning-effort labels (Title case) while keeping raw values", () => { + const effort = byCategory( + buildConfigOptions({ + mode: "auto", + model: "gpt-5.5", + effort: "high", + models: [], + efforts: ["low", "medium", "high"], + }), + "thought_level", + ); + expect(effort.options).toEqual([ + { name: "Low", value: "low" }, + { name: "Medium", value: "medium" }, + { name: "High", value: "high" }, + ]); + }); + + it("falls back to the single current model and DEFAULT_EFFORTS when lists are empty", () => { + const opts = buildConfigOptions({ + mode: "auto", + model: "gpt-5.5", + models: [], + efforts: [], + }); + expect(byCategory(opts, "model").options).toEqual([ + { name: "gpt-5.5", value: "gpt-5.5" }, + ]); + expect( + byCategory(opts, "thought_level").options.map((o) => o.value), + ).toEqual(DEFAULT_EFFORTS); + }); +}); diff --git a/packages/agent/src/adapters/codex-app-server/session-config.ts b/packages/agent/src/adapters/codex-app-server/session-config.ts new file mode 100644 index 0000000000..dea07c69d6 --- /dev/null +++ 
b/packages/agent/src/adapters/codex-app-server/session-config.ts @@ -0,0 +1,325 @@ +import type { SessionConfigOption } from "@agentclientprotocol/sdk"; +import { type GatewayModel, isOpenAIModel } from "../../gateway-models"; +import { getReasoningEffortOptions } from "../codex/models"; + +/** + * Session config + mode synthesis for the codex app-server adapter. The native + * app-server has no "mode" RPC (a thread is configured by `approvalPolicy` + + * `sandbox`), so modes are synthesized here and applied per-turn. + */ + +/** + * Per-turn sandbox the mode maps to (subset of codex's SandboxPolicy). This is + * what makes read-only/plan actually block edits — `approvalPolicy` alone is + * neutralized because the process spawns editable. + */ +export type CodexSandboxPolicy = + | { type: "readOnly"; networkAccess: boolean } + | { type: "dangerFullAccess" }; + +export interface CodexMode { + id: string; + name: string; + description: string; + /** codex AskForApproval the mode maps to, applied per-turn on turn/start. */ + approvalPolicy: string; + /** + * Per-turn sandbox override; undefined keeps the spawned editable sandbox. + * Only applied off the cloud sandbox, where a non-danger policy would re-engage + * the unavailable linux-sandbox and panic. + */ + sandboxPolicy?: CodexSandboxPolicy; + /** + * codex's native collaboration mode (per-turn on `turn/start`). "plan" unlocks + * plan proposals + `request_user_input`; everything else runs "default". + */ + collaborationMode?: "plan" | "default"; + /** + * codex's named permission profile (per-turn `activePermissionProfile.extends`). + * codex 0.140.0 enforces the sandbox through these built-in profiles; the raw + * `sandboxPolicy` is no longer honored alone. Undefined keeps the spawned default. + */ + permissionProfile?: string; +} + +// Flattened Claude-style presets. 
Restriction is driven by approvalPolicy + the +// named permissionProfile (codex 0.140.0's enforced sandbox lever); plan/read-only +// block edits, auto/full-access keep the spawned editable sandbox. +export const CODEX_MODES: CodexMode[] = [ + { + id: "plan", + name: "Plan", + description: "Plan first — inspect and propose; makes no changes", + approvalPolicy: "on-request", + sandboxPolicy: { type: "readOnly", networkAccess: true }, + permissionProfile: ":read-only", + collaborationMode: "plan", + }, + { + id: "read-only", + name: "Read only", + description: "Read-only — can inspect but not modify files", + approvalPolicy: "untrusted", + sandboxPolicy: { type: "readOnly", networkAccess: true }, + permissionProfile: ":read-only", + }, + { + id: "auto", + name: "Auto", + description: "Edits the workspace; asks before risky operations", + approvalPolicy: "on-request", + }, + { + id: "full-access", + name: "Full access", + description: "Auto-approves all operations", + approvalPolicy: "never", + }, +]; + +export const DEFAULT_MODE = "auto"; + +export function modeApprovalPolicy( + modeId: string | undefined, +): string | undefined { + return CODEX_MODES.find((m) => m.id === modeId)?.approvalPolicy; +} + +/** Per-turn sandbox for a mode id (undefined keeps the spawned full-access). */ +export function sandboxPolicyFor( + modeId: string | undefined, +): CodexSandboxPolicy | undefined { + return CODEX_MODES.find((m) => m.id === modeId)?.sandboxPolicy; +} + +/** Named permission profile for a mode (undefined keeps the spawned default). */ +export function permissionProfileFor( + modeId: string | undefined, +): string | undefined { + return CODEX_MODES.find((m) => m.id === modeId)?.permissionProfile; +} + +/** codex collaboration mode for a preset — "plan" only for Plan, else "default". */ +export function collaborationModeFor( + modeId: string | undefined, +): "plan" | "default" { + return ( + CODEX_MODES.find((m) => m.id === modeId)?.collaborationMode ?? 
"default" + ); +} + +/** + * Resolve the host's initial `_meta.permissionMode` to a codex mode. A recognized + * mode is honored; anything else (e.g. "bypassPermissions") falls back to default. + */ +export function resolveInitialMode(permissionMode: string | undefined): string { + return permissionMode && CODEX_MODES.some((m) => m.id === permissionMode) + ? permissionMode + : DEFAULT_MODE; +} + +/** Codex's standard reasoning efforts; used when model/list doesn't expose them. */ +export const DEFAULT_EFFORTS = ["low", "medium", "high"]; + +// Display labels for reasoning efforts; the host renders `name` verbatim. +const EFFORT_LABELS: Record<string, string> = { + low: "Low", + medium: "Medium", + high: "High", + xhigh: "Extra High", + max: "Max", +}; + +function humanizeEffort(effort: string): string { + return EFFORT_LABELS[effort] ?? effort; +} + +/** The current selector values `buildConfigOptions` projects into ACP options. */ +export interface ConfigSelectors { + /** Current permission/collaboration preset id (one of CODEX_MODES). */ + mode: string; + model: string; + effort?: string; + /** From model/list; falls back to the single current model when empty. */ + models: Array<{ id: string; name: string }>; + efforts: string[]; +} + +/** Builds the ACP configOptions (mode + model + thought_level) the host renders. */ +export function buildConfigOptions(s: ConfigSelectors): SessionConfigOption[] { + const baseModels = s.models.length + ? s.models + : [{ id: s.model, name: s.model }]; + // Ensure the active model stays selectable, else currentValue points at nothing. + const models = baseModels.some((m) => m.id === s.model) + ? baseModels + : [...baseModels, { id: s.model, name: s.model }]; + const baseEfforts = s.efforts.length ? s.efforts : DEFAULT_EFFORTS; + const currentEffort = s.effort ?? baseEfforts[0]; + const efforts = baseEfforts.includes(currentEffort) + ? 
baseEfforts + : [...baseEfforts, currentEffort]; + return [ + { + type: "select", + id: "mode", + name: "Mode", + category: "mode", + currentValue: s.mode, + options: CODEX_MODES.map((m) => ({ + name: m.name, + value: m.id, + description: m.description, + })), + } as unknown as SessionConfigOption, + { + type: "select", + id: "model", + name: "Model", + category: "model", + currentValue: s.model, + options: models.map((m) => ({ name: m.name, value: m.id })), + } as unknown as SessionConfigOption, + { + type: "select", + id: "effort", + name: "Reasoning effort", + category: "thought_level", + currentValue: currentEffort, + options: efforts.map((e) => ({ name: humanizeEffort(e), value: e })), + } as unknown as SessionConfigOption, + ]; +} + +/** A model entry from the app-server's `model/list` (loosely typed). */ +interface RawModel { + id?: string; + model?: string; + displayName?: string; + hidden?: boolean; + supportedReasoningEfforts?: Array<{ reasoningEffort?: string } | string>; +} + +/** + * Stateful holder for a codex session's model / effort / mode selectors and the + * ACP `configOptions` derived from them — synthesizing the Claude-style picker + * the app-server has no native concept of, rebuilt on every change. + */ +export class SessionConfigState { + private _model: string; + private _effort?: string; + private _mode = DEFAULT_MODE; + private models: Array<{ id: string; name: string }> = []; + private efforts: string[] = []; + private _options: SessionConfigOption[] = []; + + constructor(model: string, effort?: string) { + this._model = model; + this._effort = effort; + this.rebuild(); + } + + get model(): string { + return this._model; + } + get effort(): string | undefined { + return this._effort; + } + get mode(): string { + return this._mode; + } + get options(): SessionConfigOption[] { + return this._options; + } + + /** Apply the host's initial approval mode (from `_meta.permissionMode`). 
*/ + setInitialMode(permissionMode: string | undefined): void { + this._mode = resolveInitialMode(permissionMode); + this.rebuild(); + } + + /** Apply a `setSessionConfigOption` change; returns whether the mode changed. */ + setOption( + configId: string | undefined, + value: unknown, + ): { modeChanged: boolean } { + let modeChanged = false; + if (typeof value === "string") { + if (configId === "model") this._model = value; + else if (configId === "effort") this._effort = value; + else if (configId === "mode") { + this._mode = value; + modeChanged = true; + } + } + this.rebuild(); + return { modeChanged }; + } + + /** + * Populate the model + effort selectors from a `model/list` `data` array. The + * gateway also serves Claude models, so drop non-OpenAI ones; it doesn't + * populate efforts, so fall back to the shared codex model→effort map. + */ + loadModels(rawModels: RawModel[]): void { + this.models = rawModels + .filter((m) => !m?.hidden) + .filter((m) => isOpenAIModel(m as unknown as GatewayModel)) + .map((m) => ({ + id: (m.id ?? m.model) as string, + name: (m.displayName ?? m.id ?? m.model) as string, + })); + const current = rawModels.find( + (m) => m.id === this._model || m.model === this._model, + ); + const liveEfforts = (current?.supportedReasoningEfforts ?? []) + .map((e) => (typeof e === "string" ? e : e?.reasoningEffort)) + .filter((e): e is string => typeof e === "string"); + this.efforts = liveEfforts.length + ? liveEfforts + : getReasoningEffortOptions(this._model).map((o) => o.value); + this.rebuild(); + } + + /** Reset the model/effort lists (model/list failed); keeps the current model. */ + clearModels(): void { + this.models = []; + this.efforts = []; + this.rebuild(); + } + + /** + * codex's per-turn `collaborationMode`: `{ mode, settings: { model } }`. The + * model must be a string (not the null in collaborationMode/list output). 
+ */ + collaborationModeForTurn(): unknown { + return { + mode: collaborationModeFor(this._mode), + settings: { model: this._model }, + }; + } + + approvalPolicy(): string | undefined { + return modeApprovalPolicy(this._mode); + } + + sandboxPolicy(): CodexSandboxPolicy | undefined { + return sandboxPolicyFor(this._mode); + } + + /** Per-turn `activePermissionProfile` (codex 0.140.0's enforced sandbox), or undefined. */ + permissionProfile(): { extends: string } | undefined { + const profile = permissionProfileFor(this._mode); + return profile ? { extends: profile } : undefined; + } + + private rebuild(): void { + this._options = buildConfigOptions({ + mode: this._mode, + model: this._model, + effort: this._effort, + models: this.models, + efforts: this.efforts, + }); + } +} diff --git a/packages/agent/src/adapters/codex-app-server/spawn.test.ts b/packages/agent/src/adapters/codex-app-server/spawn.test.ts index 0be0058b4b..a0db5c3b62 100644 --- a/packages/agent/src/adapters/codex-app-server/spawn.test.ts +++ b/packages/agent/src/adapters/codex-app-server/spawn.test.ts @@ -19,13 +19,42 @@ describe("buildAppServerArgs", () => { ); }); - it("passes guidance via developer_instructions, never the replacing key", () => { + it.each([ + ["darwin", 'sandbox_mode="workspace-write"'], + ["linux", 'sandbox_mode="danger-full-access"'], + ["win32", 'sandbox_mode="danger-full-access"'], + ])( + "on %s spawns with %s (macOS keeps the sandbox engaged so read-only can restrict; cloud/linux avoids the linux-sandbox panic)", + (platform, expected) => { + const original = process.platform; + Object.defineProperty(process, "platform", { + value: platform, + configurable: true, + }); + try { + const args = buildAppServerArgs({ binaryPath: "/bundle/codex" }); + expect(args).toContain(expected); + expect(args.filter((a) => a.startsWith("sandbox_mode="))).toHaveLength( + 1, + ); + } finally { + Object.defineProperty(process, "platform", { + value: original, + configurable: true, + }); + } + 
}, + ); + + it("does not set instructions at spawn (developer_instructions are per-thread)", () => { const args = buildAppServerArgs({ binaryPath: "/bundle/codex", developerInstructions: "Follow PostHog rules.", }); - expect(args).toContain('developer_instructions="Follow PostHog rules."'); + expect(args.some((arg) => arg.startsWith("developer_instructions="))).toBe( + false, + ); expect(args.some((arg) => arg.startsWith("instructions="))).toBe(false); }); }); diff --git a/packages/agent/src/adapters/codex-app-server/spawn.ts b/packages/agent/src/adapters/codex-app-server/spawn.ts index 2db7a633b0..48a97d67ca 100644 --- a/packages/agent/src/adapters/codex-app-server/spawn.ts +++ b/packages/agent/src/adapters/codex-app-server/spawn.ts @@ -4,6 +4,7 @@ import { delimiter, dirname } from "node:path"; import type { Readable, Writable } from "node:stream"; import type { ProcessSpawnedCallback } from "../../types"; import { Logger } from "../../utils/logger"; +import { CodexSettingsManager } from "../codex/settings"; export interface CodexAppServerProcessOptions { /** Path to the native `codex` CLI binary (the one that exposes `app-server`). */ @@ -13,6 +14,8 @@ export interface CodexAppServerProcessOptions { apiKey?: string; /** Guidance appended to Codex's base prompt via `developer_instructions`. */ developerInstructions?: string; + /** Extra codex `-c key=value` config overrides (e.g. auto_compact_token_limit). */ + configOverrides?: Record<string, string | number | boolean>; logger?: Logger; processCallbacks?: ProcessSpawnedCallback; } @@ -31,6 +34,27 @@ export function buildAppServerArgs( args.push("-c", "features.remote_models=false"); + // OS sandbox gated on platform (= availability): macOS Seatbelt → workspace-write + // (keeps the sandbox engaged so a per-turn readOnly can tighten it and block + // edits); linux/windows have no sandbox launcher and would panic, so + // danger-full-access (the enclosing docker/Modal sandbox isolates instead). 
+ args.push( + "-c", + process.platform === "darwin" + ? `sandbox_mode="workspace-write"` + : `sandbox_mode="danger-full-access"`, + ); + + // Disable the user's ambient ~/.codex MCP servers so the adapter only exposes + // MCP servers PostHog injects per-thread; otherwise codex fails connecting to them. + for (const name of new CodexSettingsManager( + options.cwd ?? process.cwd(), + ).getSettings().mcpServerNames) { + // codex's `-c` parser rejects quoted/special key segments; skip such names. + if (!/^[A-Za-z0-9_-]+$/.test(name)) continue; + args.push("-c", `mcp_servers.${name}.enabled=false`); + } + if (options.apiBaseUrl) { args.push("-c", `model_provider="posthog"`); args.push("-c", `model_providers.posthog.name="PostHog Gateway"`); @@ -42,13 +66,15 @@ export function buildAppServerArgs( ); } - if (options.developerInstructions) { - const escaped = options.developerInstructions - .replace(/\\/g, "\\\\") - .replace(/\n/g, "\\n") - .replace(/\r/g, "\\r") - .replace(/"/g, '\\"'); - args.push("-c", `developer_instructions="${escaped}"`); + // developer_instructions are set per-thread in thread/start (with the host's + // task system prompt), not as a spawn-level global default. + + // Numbers/bools go bare; strings are quoted and escaped (JSON escapes are valid TOML basic-string escapes). + for (const [key, value] of Object.entries(options.configOverrides ?? {})) { + args.push( + "-c", + `${key}=${typeof value === "string" ? JSON.stringify(value) : value}`, + ); } return args; diff --git a/packages/agent/src/adapters/codex-app-server/turn-controller.ts b/packages/agent/src/adapters/codex-app-server/turn-controller.ts new file mode 100644 index 0000000000..5192222f77 --- /dev/null +++ b/packages/agent/src/adapters/codex-app-server/turn-controller.ts @@ -0,0 +1,96 @@ +import type { StopReason } from "@agentclientprotocol/sdk"; + +interface PendingTurn { + resolve: (reason: StopReason) => void; + reject: (err: Error) => void; +} + +/** + * The turn state machine for one codex thread. 
A turn is async: `prompt()` starts it and + * awaits a completion promise `turn/completed` (or interrupt/error) resolves. Owns the + * in-flight `turnId`, the pending completion, and the ids of interrupted turns to drop. + */ +export class TurnController { + private turnId?: string; + private pending?: PendingTurn; + private completion?: Promise<StopReason>; + private readonly cancelled = new Set<string>(); + + begin(): Promise<StopReason> { + this.completion = new Promise<StopReason>((resolve, reject) => { + this.pending = { resolve, reject }; + }); + return this.completion; + } + + /** The live turn id (steer precondition / interrupt target), if a turn started. */ + get activeTurnId(): string | undefined { + return this.turnId; + } + + get isPending(): boolean { + return this.pending !== undefined; + } + + /** A turn is running AND has a turnId — i.e. it can be steered. */ + get isRunning(): boolean { + return this.pending !== undefined && this.turnId !== undefined; + } + + /** Capture the turn id from turn/started (only while a turn is pending). */ + onStarted(id: string | undefined): void { + if (this.pending && typeof id === "string") this.turnId = id; + } + + onSteered(id: string | undefined): void { + if (typeof id === "string") this.turnId = id; + } + + /** Await the in-flight turn's completion (the steer path reuses the original). */ + awaitCompletion(): Promise<StopReason> { + return this.completion ?? Promise.resolve("end_turn"); + } + + /** Atomically claim the pending turn (clears the slot + turnId synchronously), or undefined if already claimed. */ + claim(): PendingTurn | undefined { + const pending = this.pending; + if (!pending) return undefined; + this.pending = undefined; + this.turnId = undefined; + return pending; + } + + /** Mark the live turn interrupted (so its late completion is dropped) and return its id, or undefined. 
*/ + markInterrupted(): string | undefined { + if (!this.turnId) return undefined; + this.cancelled.add(this.turnId); + return this.turnId; + } + + /** True (once) if this completion is for an interrupted turn we should drop. */ + shouldDropCompletion(id: string | undefined): boolean { + return id ? this.cancelled.delete(id) : false; + } + + /** Clear the pending slot after prompt() returns (covers a turn/start throw). */ + finishPrompt(): void { + this.pending = undefined; + this.completion = undefined; + } + + /** Reject the in-flight turn (e.g. the server exited before it completed). */ + fail(err: Error): void { + this.pending?.reject(err); + this.pending = undefined; + this.completion = undefined; + } + + /** Resolve and clear everything on session close. */ + close(reason: StopReason): void { + this.turnId = undefined; + this.pending?.resolve(reason); + this.pending = undefined; + this.completion = undefined; + this.cancelled.clear(); + } +} diff --git a/packages/agent/src/adapters/codex-app-server/usage-tracker.ts b/packages/agent/src/adapters/codex-app-server/usage-tracker.ts new file mode 100644 index 0000000000..ecd87ffa47 --- /dev/null +++ b/packages/agent/src/adapters/codex-app-server/usage-tracker.ts @@ -0,0 +1,88 @@ +import { + type ContextBreakdownBaseline, + emptyBaseline, +} from "../claude/context-breakdown"; +import type { AccumulatedUsage } from "./ext-notifications"; + +/** The live `_posthog/usage_update` fields (context-window occupancy). */ +export interface UsageUpdate { + used: number; + size: number | null; + usage: { + inputTokens?: number; + outputTokens?: number; + cachedReadTokens?: number; + reasoningTokens?: number; + totalTokens?: number; + }; +} + +/** + * Tracks token usage for one codex thread. codex's `thread/tokenUsage/updated` carries + * `{ total, last, modelContextWindow }`; `last` drives both context occupancy and per-turn + * usage rather than diffing `total` (a fallback for builds predating `last`). 
+ */ +export class UsageTracker { + private baseline: ContextBreakdownBaseline = emptyBaseline(); + private lastTurn?: AccumulatedUsage; + private contextUsed?: number; + + setBaseline(baseline: ContextBreakdownBaseline): void { + this.baseline = baseline; + } + + get baselineBreakdown(): ContextBreakdownBaseline { + return this.baseline; + } + + /** Zero the per-turn view at turn start so a token-less turn reports 0. */ + resetForTurn(): void { + this.lastTurn = undefined; + this.contextUsed = undefined; + } + + /** Ingest a `thread/tokenUsage/updated` payload; returns the live usage_update, or null if unusable. */ + ingest(params: unknown): UsageUpdate | null { + const tu = (params as { tokenUsage?: any })?.tokenUsage; + const total = tu?.total; + if (!total) return null; + const context = tu.last ?? total; + // Drives the per-source breakdown's "conversation" bucket on turn complete. + this.contextUsed = context.inputTokens ?? context.totalTokens; + this.lastTurn = { + inputTokens: context.inputTokens ?? 0, + outputTokens: context.outputTokens ?? 0, + cachedReadTokens: context.cachedInputTokens ?? 0, + // codex's TokenUsageBreakdown has no cache-write field; 0 is authoritative. + cachedWriteTokens: 0, + }; + return { + used: context.totalTokens, + size: tu.modelContextWindow ?? null, + usage: { + inputTokens: context.inputTokens, + outputTokens: context.outputTokens, + cachedReadTokens: context.cachedInputTokens, + reasoningTokens: context.reasoningOutputTokens, + totalTokens: context.totalTokens, + }, + }; + } + + /** Per-turn usage for `_posthog/turn_complete` — codex's `last`, not a delta. */ + perTurnUsage(): AccumulatedUsage { + return ( + this.lastTurn ?? { + inputTokens: 0, + outputTokens: 0, + cachedReadTokens: 0, + cachedWriteTokens: 0, + } + ); + } + + /** Live context occupancy (last turn's input tokens), or undefined pre-usage. 
*/ + contextTokens(): number | undefined { + return this.contextUsed; + } +} diff --git a/packages/agent/src/adapters/codex/spawn.ts b/packages/agent/src/adapters/codex/spawn.ts index 9e14e1a8cd..c023b31126 100644 --- a/packages/agent/src/adapters/codex/spawn.ts +++ b/packages/agent/src/adapters/codex/spawn.ts @@ -25,6 +25,12 @@ export interface CodexProcessOptions { settings?: CodexSettings; /** Additional writable roots passed to Codex's workspace-write sandbox. */ additionalDirectories?: string[]; + /** + * Extra codex `-c key=value` config overrides (app-server sub-adapter only). + * An escape hatch for config the adapter doesn't model — e.g. the e2e sets + * `auto_compact_token_limit` low to force a compaction. + */ + configOverrides?: Record<string, string | number | boolean>; } export interface CodexProcess { @@ -39,6 +45,14 @@ function buildConfigArgs(options: CodexProcessOptions): string[] { args.push("-c", `features.remote_models=false`); + // The agent already runs inside PostHog's isolated sandbox (docker/Modal with + // agentsh egress + filesystem controls), so Codex's own OS-level sandbox is + // redundant — and its `linux-sandbox` launcher is unavailable inside that + // sandbox, so the default workspace-write mode panics ("sandbox launcher + // unavailable" → require_escalated) and wedges the session. Run Codex with no + // nested sandbox; the enclosing sandbox provides the isolation. + args.push("-c", `sandbox_mode="danger-full-access"`); + // Disable the user's local MCPs one-by-one so Codex only uses the MCPs we // provide via ACP. We can't use `-c mcp_servers={}` because that makes Codex // ignore MCPs entirely, including the ones we inject later. 
diff --git a/packages/agent/src/agent.ts b/packages/agent/src/agent.ts index 28d26d627c..85c78eab47 100644 --- a/packages/agent/src/agent.ts +++ b/packages/agent/src/agent.ts @@ -129,6 +129,7 @@ export class Agent { logger: this.logger, processCallbacks: options.processCallbacks, onStructuredOutput: options.onStructuredOutput, + useCodexAppServer: options.useCodexAppServer, allowedModelIds, posthogApiConfig: this.posthogApiConfig, enricherEnabled: this.enricherEnabled, diff --git a/packages/agent/src/execution-mode.test.ts b/packages/agent/src/execution-mode.test.ts index be59649062..669715968e 100644 --- a/packages/agent/src/execution-mode.test.ts +++ b/packages/agent/src/execution-mode.test.ts @@ -12,8 +12,9 @@ describe("execution modes", () => { ]); }); - it("includes full access for codex sessions", () => { + it("exposes the same presets as a live codex session (incl. plan)", () => { expect(getAvailableCodexModes().map((mode) => mode.id)).toEqual([ + "plan", "read-only", "auto", "full-access", diff --git a/packages/agent/src/execution-mode.ts b/packages/agent/src/execution-mode.ts index 99f6799183..c90925e631 100644 --- a/packages/agent/src/execution-mode.ts +++ b/packages/agent/src/execution-mode.ts @@ -73,7 +73,16 @@ export function isCodexNativeMode(mode: string): mode is CodexNativeMode { return (CODEX_NATIVE_MODES as readonly string[]).includes(mode); } +// Mirrors the codex app-server adapter's CODEX_MODES (session-config.ts) so the +// task-creation picker offers the same presets as a live session. "plan" is a +// valid CodeExecutionMode that codex-acp maps to read-only, and the app-server +// gives it a read-only sandbox — so it is safe on both sub-adapters. 
const codexModes: ModeInfo[] = [ + { + id: "plan", + name: "Plan", + description: "Plan first — inspect and propose; makes no changes", + }, { id: "read-only", name: "Read Only", diff --git a/packages/agent/src/server/agent-server.ts b/packages/agent/src/server/agent-server.ts index 5d64177928..9f854db6c0 100644 --- a/packages/agent/src/server/agent-server.ts +++ b/packages/agent/src/server/agent-server.ts @@ -395,7 +395,14 @@ export class AgentServer { } private shouldRelayPermissionToClient(mode: PermissionMode): boolean { - return mode === "default" || mode === "auto" || mode === "read-only"; + // "plan" relays like "read-only" (look-don't-touch): escalations need a human + // veto, not silent auto-approval. + return ( + mode === "default" || + mode === "auto" || + mode === "read-only" || + mode === "plan" + ); } private createApp(): Hono { @@ -1156,6 +1163,11 @@ export class AgentServer { cwd: this.config.repositoryPath ?? "/tmp/workspace", apiBaseUrl: gatewayEnv.openaiBaseUrl, apiKey: this.config.apiKey, + // Path to the bundled codex-acp binary; the native app-server + // adapter derives `codex` from the same directory. Set in the + // sandbox image (POSTHOG_CODEX_BINARY_PATH); when unset the + // adapter falls back to npx codex-acp. + binaryPath: process.env.POSTHOG_CODEX_BINARY_PATH, model: this.config.model ?? DEFAULT_CODEX_MODEL, reasoningEffort: this.config.reasoningEffort, developerInstructions: codexInstructions, @@ -2915,9 +2927,13 @@ ${signedCommitInstructions} isQuestion || this.shouldRelayPermissionToClient(sessionPermissionMode); + // A background run has no human to answer a relayed approval + // (hasDesktopConnected is true from the event-relay reader), so + // auto-approve rather than hang on it. 
if ( - isPlanApproval || - (needsDesktopApproval && this.session?.hasDesktopConnected) + mode !== "background" && + (isPlanApproval || + (needsDesktopApproval && this.session?.hasDesktopConnected)) ) { this.logger.debug("Relaying permission request", { kind: params.toolCall?.kind, diff --git a/packages/agent/src/types.ts b/packages/agent/src/types.ts index 0056590678..d5c10e6169 100644 --- a/packages/agent/src/types.ts +++ b/packages/agent/src/types.ts @@ -65,6 +65,12 @@ export interface TaskExecutionOptions { onStructuredOutput?: (output: Record) => Promise; /** Additional directories the agent process can access beyond cwd. */ additionalDirectories?: string[]; + /** + * Codex-only feature-flag lever: `true` selects the native app-server adapter, + * `false` codex-acp. The host evaluates a PostHog flag and passes the result; + * undefined falls back to env overrides then the bundled-binary default. + */ + useCodexAppServer?: boolean; } export type LogLevel = "debug" | "info" | "warn" | "error"; diff --git a/packages/agent/vitest.e2e.config.ts b/packages/agent/vitest.e2e.config.ts new file mode 100644 index 0000000000..01e95543af --- /dev/null +++ b/packages/agent/vitest.e2e.config.ts @@ -0,0 +1,24 @@ +import { resolve } from "node:path"; +import { defineConfig } from "vitest/config"; + +// Live, opt-in e2e suite. Separate from the default `vitest.config.ts` (which +// only includes `src/**`), so these never run under `pnpm test` or the default CI +// test jobs — only via `pnpm test:e2e` (invoked by the opt-in `e2e` CI job). +// Sequential, generous timeouts: each test drives two real model turns end to end. 
+export default defineConfig({ + resolve: { + alias: { + "@": resolve(__dirname, "src"), + }, + }, + test: { + globals: true, + environment: "node", + include: ["e2e/**/*.e2e.test.ts"], + exclude: ["**/node_modules/**", "**/dist/**"], + isolate: true, + fileParallelism: false, + testTimeout: 300_000, + hookTimeout: 120_000, + }, +}); diff --git a/packages/core/src/sessions/cloudSessionConfig.test.ts b/packages/core/src/sessions/cloudSessionConfig.test.ts index d0712992ec..8bcf537ba7 100644 --- a/packages/core/src/sessions/cloudSessionConfig.test.ts +++ b/packages/core/src/sessions/cloudSessionConfig.test.ts @@ -61,7 +61,8 @@ describe("buildCloudDefaultConfigOptions", () => { it.each([ { initialMode: "auto", expected: "auto" }, { initialMode: "full-access", expected: "full-access" }, - { initialMode: "plan", expected: "auto" }, + // plan is now a valid codex preset (mirrors the app-server), so it's kept. + { initialMode: "plan", expected: "plan" }, { initialMode: "default", expected: "auto" }, ])( "validates codex initial mode $initialMode", diff --git a/packages/core/src/sessions/contextUsage.test.ts b/packages/core/src/sessions/contextUsage.test.ts index 4280d146c5..fb295c5818 100644 --- a/packages/core/src/sessions/contextUsage.test.ts +++ b/packages/core/src/sessions/contextUsage.test.ts @@ -56,6 +56,27 @@ describe("extractContextUsage", () => { expect(result?.breakdown).toBeNull(); }); + it("surfaces token count even when the context window size is unknown", () => { + // codex omits `size` when the protocol has no modelContextWindow — the + // aggregate must still render (size 0, no percentage) rather than vanish. 
+ const event: AcpMessage = { + type: "acp_message", + ts: 1, + message: { + jsonrpc: "2.0", + method: "session/update", + params: { + sessionId: "s1", + update: { sessionUpdate: "usage_update", used: 50_000 }, + }, + }, + }; + const result = extractContextUsage([event]); + expect(result?.used).toBe(50_000); + expect(result?.size).toBe(0); + expect(result?.percentage).toBe(0); + }); + it("merges breakdown from a _posthog/usage_update notification", () => { const result = extractContextUsage([ usageUpdateEvent(50_000, 200_000), diff --git a/packages/core/src/sessions/contextUsage.ts b/packages/core/src/sessions/contextUsage.ts index fb59a55060..22f33280e9 100644 --- a/packages/core/src/sessions/contextUsage.ts +++ b/packages/core/src/sessions/contextUsage.ts @@ -82,16 +82,18 @@ function extractAggregate( const update = params?.update; if ( update?.sessionUpdate === "usage_update" && - typeof update.used === "number" && - typeof update.size === "number" + typeof update.used === "number" ) { + // The model context window (`size`) may be unknown — e.g. codex omits it + // when the protocol doesn't report `modelContextWindow`. Still surface the + // raw token count (size 0 → the indicator shows used tokens, no + // percentage) rather than dropping the whole aggregate. + const size = typeof update.size === "number" ? update.size : 0; const percentage = - update.size > 0 - ? Math.min(100, Math.round((update.used / update.size) * 100)) - : 0; + size > 0 ? Math.min(100, Math.round((update.used / size) * 100)) : 0; return { used: update.used, - size: update.size, + size, percentage, cost: update.cost ?? 
null, }; diff --git a/packages/core/src/sessions/executionModes.ts b/packages/core/src/sessions/executionModes.ts index 8d471d44f1..dc36a4a6a8 100644 --- a/packages/core/src/sessions/executionModes.ts +++ b/packages/core/src/sessions/executionModes.ts @@ -32,7 +32,15 @@ const availableModes: ModeInfo[] = [ }, ]; +// Mirrors the codex app-server adapter's CODEX_MODES so the picker offers the +// same presets as a live session. "plan" is a CodeExecutionMode codex-acp maps +// to read-only and the app-server gives a read-only sandbox — safe on both. const codexModes: ModeInfo[] = [ + { + id: "plan", + name: "Plan", + description: "Plan first — inspect and propose; makes no changes", + }, { id: "read-only", name: "Read Only", diff --git a/packages/core/src/sessions/sessionService.ts b/packages/core/src/sessions/sessionService.ts index 36b18a1bd6..1c44ae604e 100644 --- a/packages/core/src/sessions/sessionService.ts +++ b/packages/core/src/sessions/sessionService.ts @@ -26,7 +26,9 @@ import { type OptimisticItem, type PermissionRequest, type QueuedMessage, + resolveBypassRevertMode, type StoredLogEntry, + sessionSupportsNativeSteer, type TaskRunStatus, } from "@posthog/shared"; import { ANALYTICS_EVENTS } from "@posthog/shared/analytics-events"; @@ -252,6 +254,13 @@ export interface SessionServiceHelpers { ) => Promise; } +/** + * PostHog flag gating the native codex app-server sub-adapter. When enabled for + * the user, a codex session uses the app-server adapter instead of codex-acp. + * Resolved at session start and passed to the agent as `useCodexAppServer`. + */ +export const CODEX_APP_SERVER_FLAG = "codex-app-server"; + export interface SessionServiceDeps { trpc: SessionTrpc; store: ISessionStore; @@ -267,6 +276,12 @@ export interface SessionServiceDeps { info: (msg: any, opts?: any) => unknown; }; track: (event: string, props?: Record) => void; + /** + * Evaluates a PostHog feature flag for the current user. 
Used to resolve + * {@link CODEX_APP_SERVER_FLAG} at session start. Optional so non-desktop + * hosts (stubbed web, tests) can omit it — absent is treated as "flag off". + */ + featureFlags?: { isEnabled(flagKey: string): boolean }; buildPermissionToolMetadata: (...args: any[]) => any; notifyPermissionRequest: (...args: any[]) => any; notifyPromptComplete: (...args: any[]) => any; @@ -954,6 +969,7 @@ export class SessionService { logUrl, sessionId, adapter: resolvedAdapter, + useCodexAppServer: this.resolveUseCodexAppServer(resolvedAdapter), permissionMode: persistedMode, model: persistedModel, customInstructions: customInstructions || undefined, @@ -978,6 +994,7 @@ export class SessionService { this.d.store.updateSession(taskRunId, { status: "connected", configOptions, + steering: (result as { steering?: string }).steering, }); // Persist the merged config options @@ -1245,6 +1262,26 @@ export class SessionService { ); } + /** + * Resolve the `codex-app-server` flag for a session. Only meaningful for the + * codex adapter (Claude ignores it), so returns undefined otherwise. + * + * One-way opt-in: when the flag is ON we force the app-server adapter (`true`). + * When off/unloaded (or no flags service on non-desktop hosts) we return + * `undefined` rather than `false`, so the agent falls through to its env + * override (`POSTHOG_CODEX_USE_APP_SERVER`) and then the codex-acp default — + * hard-passing `false` would shadow that env, since the host value has the + * highest precedence in resolveUseCodexAppServer. + */ + private resolveUseCodexAppServer( + adapter: "claude" | "codex" | undefined, + ): boolean | undefined { + if (adapter !== "codex") return undefined; + return this.d.featureFlags?.isEnabled(CODEX_APP_SERVER_FLAG) + ? 
true + : undefined; + } + private async createNewLocalSession( taskId: string, taskTitle: string, @@ -1277,6 +1314,7 @@ export class SessionService { projectId: auth.projectId, permissionMode: executionMode, adapter, + useCodexAppServer: this.resolveUseCodexAppServer(adapter), customInstructions: startCustomInstructions || undefined, effort: effortLevelSchema.safeParse(reasoningLevel).success ? (reasoningLevel as EffortLevel) @@ -1312,6 +1350,7 @@ export class SessionService { | SessionConfigOption[] | undefined; session.configOptions = configOptions; + session.steering = (result as { steering?: string }).steering; // Persist the config options if (configOptions) { @@ -2156,22 +2195,18 @@ export class SessionService { } // Steer: the user sent a message mid-turn and asked to fold it into the - // running turn rather than queue it. Native (Claude, local) injects at the - // next tool boundary; local Codex interrupts the turn and resends below as - // a fresh prompt. - // - // Cloud has no real mid-turn steer: the backend only delivers user messages - // between turns, so a cloud "steer" would cancel the running turn for no - // gain (the message lands next turn either way) while surfacing a jarring - // interruption. Until the backend supports true steering, cloud steer falls - // through to the queue like a normal message. Compaction also falls through. + // running turn rather than queue it. Adapters that negotiated + // `steering: "native"` (Claude, codex app-server) inject at the next tool + // boundary; codex-acp ("interrupt-resend") and unknown adapters cancel and + // resend. Cloud has no real mid-turn steer (the backend only delivers + // messages between turns), so it falls through to the queue; compaction too. 
if ( options?.steer && !session.isCloud && session.isPromptPending && !session.isCompacting ) { - if (session.adapter === "claude") { + if (sessionSupportsNativeSteer(session)) { return this.sendSteerPrompt(session, prompt); } await this.cancelPrompt(taskId); @@ -4547,6 +4582,7 @@ export class SessionService { isCloud: boolean; allowBypassPermissions: boolean; currentModeId: string | boolean | undefined; + modeOption: SessionConfigOption | undefined; }, ): void { if (options.allowBypassPermissions) return; @@ -4555,7 +4591,9 @@ export class SessionService { options.currentModeId === "bypassPermissions" || options.currentModeId === "full-access"; if (!isBypass || !taskId) return; - this.setSessionConfigOptionByCategory(taskId, "mode", "default"); + const target = resolveBypassRevertMode(options.modeOption); + if (!target) return; + this.setSessionConfigOptionByCategory(taskId, "mode", target); } /** diff --git a/packages/shared/src/index.ts b/packages/shared/src/index.ts index c8516c74ec..967c6eb9ca 100644 --- a/packages/shared/src/index.ts +++ b/packages/shared/src/index.ts @@ -184,7 +184,9 @@ export { type OptimisticItem, type PermissionRequest, type QueuedMessage, + resolveBypassRevertMode, type SessionStatus, + sessionSupportsNativeSteer, } from "./sessions"; export type { SignalReportOrderingField, @@ -217,6 +219,15 @@ export { formatRelativeTimeShort, getRelativeDateGroup, } from "./time"; +export { + mcpToolKey, + type PosthogToolMeta, + parseMcpToolName, + posthogToolMeta, + readAgentToolName, + readMcpToolDescriptor, + readMcpToolName, +} from "./tool-meta"; export { TypedEventEmitter } from "./typed-event-emitter"; export { isSafeExternalUrl } from "./url"; export { getCloudUrlFromRegion } from "./urls"; diff --git a/packages/shared/src/sessions.test.ts b/packages/shared/src/sessions.test.ts new file mode 100644 index 0000000000..85d736fbc3 --- /dev/null +++ b/packages/shared/src/sessions.test.ts @@ -0,0 +1,112 @@ +import type { SessionConfigOption } from 
"@agentclientprotocol/sdk"; +import { describe, expect, it } from "vitest"; +import { + type AgentSession, + resolveBypassRevertMode, + sessionSupportsNativeSteer, +} from "./sessions"; + +function modeOption( + values: string[], + currentValue: string, +): SessionConfigOption { + return { + type: "select", + id: "mode", + name: "Mode", + category: "mode", + currentValue, + options: values.map((v) => ({ name: v, value: v })), + } as unknown as SessionConfigOption; +} + +describe("resolveBypassRevertMode", () => { + it("reverts a claude session to 'default'", () => { + const opt = modeOption( + ["default", "acceptEdits", "plan", "bypassPermissions"], + "bypassPermissions", + ); + expect(resolveBypassRevertMode(opt)).toBe("default"); + }); + + it("reverts a codex session to 'auto', never the claude-only 'default'", () => { + const opt = modeOption( + ["plan", "read-only", "auto", "full-access"], + "full-access", + ); + const target = resolveBypassRevertMode(opt); + expect(target).toBe("auto"); + expect(target).not.toBe("default"); + }); + + it("falls back to the first non-bypass option when neither default nor auto exist", () => { + expect( + resolveBypassRevertMode( + modeOption(["read-only", "full-access"], "full-access"), + ), + ).toBe("read-only"); + }); + + it("returns undefined for a missing or non-select option", () => { + expect(resolveBypassRevertMode(undefined)).toBeUndefined(); + expect( + resolveBypassRevertMode({ + type: "boolean", + } as unknown as SessionConfigOption), + ).toBeUndefined(); + }); +}); + +describe("sessionSupportsNativeSteer", () => { + type Case = Pick; + + it.each<[string, Case, boolean]>([ + // Capability-driven: "native" folds the message into the running turn. 
+ [ + "claude advertises native", + { isCloud: false, steering: "native", adapter: "claude" }, + true, + ], + [ + "codex app-server advertises native", + { isCloud: false, steering: "native", adapter: "codex" }, + true, + ], + // codex-acp advertises "interrupt-resend" — must NOT steer natively. + [ + "codex-acp interrupt-resend", + { isCloud: false, steering: "interrupt-resend", adapter: "codex" }, + false, + ], + // Fallback: pre-capability start paths leave steering unset; never regress claude. + [ + "claude with no capability (fallback)", + { isCloud: false, steering: undefined, adapter: "claude" }, + true, + ], + [ + "codex with no capability (no fallback)", + { isCloud: false, steering: undefined, adapter: "codex" }, + false, + ], + // An explicit non-native capability overrides the claude fallback. + [ + "claude explicitly non-native", + { isCloud: false, steering: "interrupt-resend", adapter: "claude" }, + false, + ], + // Cloud runs queue/resend; they never steer locally regardless of capability. + [ + "cloud claude native", + { isCloud: true, steering: "native", adapter: "claude" }, + false, + ], + [ + "cloud codex native", + { isCloud: true, steering: "native", adapter: "codex" }, + false, + ], + ])("%s", (_label, session, expected) => { + expect(sessionSupportsNativeSteer(session)).toBe(expected); + }); +}); diff --git a/packages/shared/src/sessions.ts b/packages/shared/src/sessions.ts index 0724dddfac..a278771721 100644 --- a/packages/shared/src/sessions.ts +++ b/packages/shared/src/sessions.ts @@ -65,6 +65,13 @@ export interface AgentSession { framework?: "claude"; adapter?: Adapter; configOptions?: SessionConfigOption[]; + /** + * Adapter's negotiated steering capability (`_meta.posthog.steering` from + * initialize). "native" means a mid-turn message folds into the running turn + * (claude, codex app-server); "interrupt-resend" (codex-acp) or undefined + * means the host must cancel + resend. Drives the steer-vs-resend decision. 
+ */ + steering?: string; pendingPermissions: Map; pausedDurationMs: number; messageQueue: QueuedMessage[]; @@ -160,3 +167,41 @@ export function getCurrentModeFromConfigOptions( const modeOption = getConfigOptionByCategory(configOptions, "mode"); return modeOption?.currentValue as ExecutionMode | undefined; } + +/** + * The safe non-bypass mode to revert to when "Bypass permissions" is turned + * off, chosen from the session's OWN mode options so it's always valid for that + * adapter. Claude exposes "default"; codex has no "default" (its presets are + * plan/read-only/auto/full-access) so it falls back to "auto" — reverting codex + * to "default" would set an unknown mode (no approvalPolicy → an undefined + * approval state). Returns undefined when there is no usable mode option. + */ +export function resolveBypassRevertMode( + modeOption: SessionConfigOption | undefined, +): string | undefined { + if (modeOption?.type !== "select") return undefined; + const opts = flattenSelectOptions(modeOption.options); + const isBypass = (v: string) => + v === "bypassPermissions" || v === "full-access"; + if (opts.some((o) => o.value === "default")) return "default"; + if (opts.some((o) => o.value === "auto")) return "auto"; + return opts.find((o) => !isBypass(o.value))?.value; +} + +/** + * Whether a mid-turn message can be folded into the running turn (steered) + * rather than interrupt-and-resent. Decided by the adapter's negotiated + * `steering` capability: "native" folds (claude, codex app-server); + * "interrupt-resend" (codex-acp) does not. Cloud runs never steer locally. + * + * Fallback: if `steering` is unset (a start path that predates capability + * plumbing), Claude is still treated as native — it has always steered — so the + * capability rollout can never regress it. 
+ */ +export function sessionSupportsNativeSteer( + session: Pick, +): boolean { + if (session.isCloud) return false; + if (session.steering === "native") return true; + return session.steering == null && session.adapter === "claude"; +} diff --git a/packages/shared/src/tool-meta.test.ts b/packages/shared/src/tool-meta.test.ts new file mode 100644 index 0000000000..8e718d8616 --- /dev/null +++ b/packages/shared/src/tool-meta.test.ts @@ -0,0 +1,82 @@ +import { describe, expect, it } from "vitest"; +import { + parseMcpToolName, + readAgentToolName, + readMcpToolDescriptor, + readMcpToolName, +} from "./tool-meta"; + +describe("parseMcpToolName", () => { + it("splits the first __ after the prefix as the server boundary", () => { + expect(parseMcpToolName("mcp__posthog__exec")).toEqual({ + server: "posthog", + tool: "exec", + }); + }); + + it("keeps single underscores inside server and tool names", () => { + expect( + parseMcpToolName("mcp__plugin_posthog_posthog__execute-sql"), + ).toEqual({ server: "plugin_posthog_posthog", tool: "execute-sql" }); + }); + + it("returns undefined for non-MCP or malformed names", () => { + expect(parseMcpToolName("Bash")).toBeUndefined(); + expect(parseMcpToolName("mcp__posthog__")).toBeUndefined(); + expect(parseMcpToolName("mcp____exec")).toBeUndefined(); + }); +}); + +describe("readAgentToolName", () => { + it("prefers the posthog channel over the legacy claudeCode fallback", () => { + expect( + readAgentToolName({ + posthog: { toolName: "mcp__posthog__exec" }, + claudeCode: { toolName: "stale" }, + }), + ).toBe("mcp__posthog__exec"); + }); + + it("falls back to claudeCode when posthog is absent", () => { + expect(readAgentToolName({ claudeCode: { toolName: "Bash" } })).toBe( + "Bash", + ); + }); + + it("returns undefined for non-tool meta", () => { + expect(readAgentToolName(undefined)).toBeUndefined(); + expect(readAgentToolName({})).toBeUndefined(); + }); +}); + +describe("readMcpToolDescriptor / readMcpToolName", () => { + 
it("uses the structured mcp descriptor when present (no name parsing)", () => { + const meta = { + posthog: { + toolName: "ignored", + mcp: { server: "posthog", tool: "exec" }, + }, + }; + expect(readMcpToolDescriptor(meta)).toEqual({ + server: "posthog", + tool: "exec", + }); + expect(readMcpToolName(meta)).toBe("mcp__posthog__exec"); + }); + + it("parses the legacy claudeCode mcp__ name when there is no structured channel", () => { + const meta = { claudeCode: { toolName: "mcp__posthog__execute-sql" } }; + expect(readMcpToolDescriptor(meta)).toEqual({ + server: "posthog", + tool: "execute-sql", + }); + expect(readMcpToolName(meta)).toBe("mcp__posthog__execute-sql"); + }); + + it("returns undefined for non-MCP tool calls", () => { + expect( + readMcpToolDescriptor({ claudeCode: { toolName: "Bash" } }), + ).toBeUndefined(); + expect(readMcpToolName({ posthog: { toolName: "Bash" } })).toBeUndefined(); + }); +}); diff --git a/packages/shared/src/tool-meta.ts b/packages/shared/src/tool-meta.ts new file mode 100644 index 0000000000..8ef62cc324 --- /dev/null +++ b/packages/shared/src/tool-meta.ts @@ -0,0 +1,82 @@ +/** + * Canonical, harness-neutral tool metadata carried on an ACP tool call's + * `_meta.posthog`. Each adapter (the native-protocol → ACP boundary) populates + * it, so the renderer never has to know which harness produced a tool call. + * + * The renderer reads through {@link readAgentToolName} / {@link readMcpToolName}, + * which prefer this channel and fall back to the legacy `_meta.claudeCode.toolName` + * the Claude adapter still writes. New adapters should only populate `posthog`. + */ +export interface PosthogToolMeta { + /** Agent-facing tool name, e.g. "Bash" or "mcp__posthog__exec". */ + toolName: string; + /** Set only for MCP tool calls — the originating server + tool. */ + mcp?: { server: string; tool: string }; +} + +/** `_meta` fragment for adapters to spread onto a tool_call update. 
*/ +export function posthogToolMeta(meta: PosthogToolMeta): { + posthog: PosthogToolMeta; +} { + return { posthog: meta }; +} + +/** Build the canonical `mcp__<server>__<tool>` key. */ +export function mcpToolKey(mcp: { server: string; tool: string }): string { + return `mcp__${mcp.server}__${mcp.tool}`; +} + +/** + * Parse a `mcp__<server>__<tool>` name into its parts; undefined when the name + * isn't MCP-shaped. The server segment never contains `__`, so the first `__` + * after the prefix terminates it and the remainder is the tool. + */ +export function parseMcpToolName( + toolName: string, +): { server: string; tool: string } | undefined { + const PREFIX = "mcp__"; + if (!toolName.startsWith(PREFIX)) return undefined; + const rest = toolName.slice(PREFIX.length); + const sep = rest.indexOf("__"); + if (sep <= 0 || sep + 2 >= rest.length) return undefined; + return { server: rest.slice(0, sep), tool: rest.slice(sep + 2) }; +} + +interface ToolCallMeta { + posthog?: PosthogToolMeta; + /** Legacy Claude-adapter channel, read only as a fallback. */ + claudeCode?: { toolName?: string }; +} + +function asToolCallMeta(meta: unknown): ToolCallMeta | undefined { + return meta && typeof meta === "object" ? (meta as ToolCallMeta) : undefined; +} + +/** Canonical agent-facing tool name: neutral channel first, legacy fallback. */ +export function readAgentToolName(meta: unknown): string | undefined { + const m = asToolCallMeta(meta); + return m?.posthog?.toolName ?? m?.claudeCode?.toolName; +} + +/** + * The MCP `{ server, tool }` descriptor for a tool call, or undefined for a + * non-MCP call. Prefers the structured channel, else parses the legacy + * `mcp__…` name. + */ +export function readMcpToolDescriptor( + meta: unknown, +): { server: string; tool: string } | undefined { + const m = asToolCallMeta(meta); + if (m?.posthog?.mcp) return m.posthog.mcp; + const name = m?.posthog?.toolName ?? m?.claudeCode?.toolName; + return name ? 
parseMcpToolName(name) : undefined; +} + +/** + * Canonical `mcp__server__tool` key for a tool call, or undefined for a non-MCP + * call. Convenience for components still keyed on the string form. + */ +export function readMcpToolName(meta: unknown): string | undefined { + const mcp = readMcpToolDescriptor(meta); + return mcp ? mcpToolKey(mcp) : undefined; +} diff --git a/packages/ui/src/features/message-editor/components/PromptInput.test.tsx b/packages/ui/src/features/message-editor/components/PromptInput.test.tsx new file mode 100644 index 0000000000..bd0a282946 --- /dev/null +++ b/packages/ui/src/features/message-editor/components/PromptInput.test.tsx @@ -0,0 +1,139 @@ +import { Theme } from "@radix-ui/themes"; +import { render, screen } from "@testing-library/react"; +import userEvent from "@testing-library/user-event"; +import type React from "react"; +import { beforeEach, describe, expect, it, vi } from "vitest"; + +const editorState = vi.hoisted(() => ({ isEmpty: false })); +const settingsState = vi.hoisted(() => ({ slotMachineMode: false })); + +vi.mock("../tiptap/useTiptapEditor", () => ({ + useTiptapEditor: () => ({ + editor: null, + isReady: true, + isEmpty: editorState.isEmpty, + isBashMode: false, + submit: vi.fn(), + focus: vi.fn(), + blur: vi.fn(), + clear: vi.fn(), + getText: vi.fn(), + getContent: vi.fn(), + setContent: vi.fn(), + insertChip: vi.fn(), + removeChipById: vi.fn(), + replaceChipAttrs: vi.fn(), + attachments: [], + addAttachment: vi.fn(), + removeAttachment: vi.fn(), + }), +})); + +vi.mock("@posthog/ui/features/settings/settingsStore", () => ({ + useSettingsStore: (selector: (s: typeof settingsState) => unknown) => + selector(settingsState), +})); + +vi.mock("../../skills/useSkills", () => ({ + useSkills: () => ({ data: [] }), +})); + +vi.mock("../draftStore", () => ({ + useDraftStore: Object.assign( + (selector: (s: unknown) => unknown) => + selector({ focusRequested: {}, actions: { clearFocusRequest: vi.fn() } }), + { + getState: () 
=> ({ + actions: { setCommands: vi.fn(), clearCommands: vi.fn() }, + }), + }, + ), +})); + +vi.mock("./AttachmentMenu", () => ({ AttachmentMenu: () => null })); +vi.mock("./AttachmentsBar", () => ({ AttachmentsBar: () => null })); +vi.mock("./SlotMachineSubmit", () => ({ + SlotMachineSubmit: ({ + disabled, + onSubmit, + }: { + disabled?: boolean; + onSubmit?: () => void; + }) => ( + + ), +})); + +import { PromptInput } from "./PromptInput"; + +function renderInput(props: Partial>) { + return render( + + + , + ); +} + +describe("PromptInput submit/stop affordance", () => { + beforeEach(() => { + vi.clearAllMocks(); + editorState.isEmpty = false; + settingsState.slotMachineMode = false; + }); + + it("shows Stop (not Send) while loading and calls onCancel when clicked", async () => { + const user = userEvent.setup(); + const onCancel = vi.fn(); + + renderInput({ isLoading: true, onCancel }); + + const stop = screen.getByRole("button", { name: "Stop" }); + expect( + screen.queryByRole("button", { name: "Send message" }), + ).not.toBeInTheDocument(); + + await user.click(stop); + expect(onCancel).toHaveBeenCalledOnce(); + }); + + it("keeps Send enabled mid-turn when no cancel handler (queue/steer path)", () => { + // isLoading true but no onCancel => inStopMode is false, so the composer + // must still expose an enabled Send so messages queue/steer mid-turn. + // Regression guard: adding `|| isLoading` to submitBlocked disables this. 
+ renderInput({ isLoading: true }); + + const send = screen.getByRole("button", { name: "Send message" }); + expect(send).toBeEnabled(); + }); + + it("disables Send when the editor is empty", () => { + editorState.isEmpty = true; + + renderInput({}); + + const send = screen.getByRole("button", { name: "Send message" }); + expect(send).toBeDisabled(); + }); +}); diff --git a/packages/ui/src/features/permissions/McpPermission.tsx b/packages/ui/src/features/permissions/McpPermission.tsx index 9b89780d3d..7fbac089aa 100644 --- a/packages/ui/src/features/permissions/McpPermission.tsx +++ b/packages/ui/src/features/permissions/McpPermission.tsx @@ -1,3 +1,4 @@ +import { readMcpToolName } from "@posthog/shared"; import { parseMcpToolKey } from "@posthog/ui/features/mcp-apps/utils/mcp-app-host-utils"; import { formatPosthogExecBody, @@ -16,9 +17,7 @@ export function McpPermission({ onSelect, onCancel, }: BasePermissionProps) { - const mcpToolName = ( - toolCall._meta as { claudeCode?: { toolName?: string } } | undefined - )?.claudeCode?.toolName; + const mcpToolName = readMcpToolName(toolCall._meta); if (!mcpToolName) { return ( diff --git a/packages/ui/src/features/permissions/PermissionSelector.tsx b/packages/ui/src/features/permissions/PermissionSelector.tsx index b89ad00d02..3030a9d56c 100644 --- a/packages/ui/src/features/permissions/PermissionSelector.tsx +++ b/packages/ui/src/features/permissions/PermissionSelector.tsx @@ -1,4 +1,5 @@ import type { PermissionOption } from "@agentclientprotocol/sdk"; +import { readMcpToolName } from "@posthog/shared"; import { DefaultPermission } from "./DefaultPermission"; import { DeletePermission } from "./DeletePermission"; import { EditPermission } from "./EditPermission"; @@ -31,11 +32,8 @@ export function PermissionSelector({ onCancel, }: PermissionSelectorProps) { const props = { toolCall, options, onSelect, onCancel }; - const meta = toolCall._meta as - | { codeToolKind?: string; claudeCode?: { toolName?: string } } - | 
undefined; - const agentToolName = meta?.claudeCode?.toolName; - if (agentToolName?.startsWith("mcp__")) { + const meta = toolCall._meta as { codeToolKind?: string } | undefined; + if (readMcpToolName(toolCall._meta)) { return ; } const kind = meta?.codeToolKind ?? (toolCall.kind as string); diff --git a/packages/ui/src/features/posthog-mcp/utils/posthog-exec-display.ts b/packages/ui/src/features/posthog-mcp/utils/posthog-exec-display.ts index 7642bb7ed9..50cda82a30 100644 --- a/packages/ui/src/features/posthog-mcp/utils/posthog-exec-display.ts +++ b/packages/ui/src/features/posthog-mcp/utils/posthog-exec-display.ts @@ -18,7 +18,12 @@ * call [--json] — invoke a tool */ -const POSTHOG_EXEC_TOOL_RE = /^mcp__(?:plugin_)?posthog(?:_[^_]+)*__exec$/; +import { parseMcpToolName } from "@posthog/shared"; + +// A PostHog MCP server name: optional `plugin_` prefix, `posthog`, then any +// number of `_<segment>` parts (e.g. `posthog`, `posthog_cloud`, +// `plugin_posthog_posthog`). The `exec` dispatcher lives on these servers. 
+const POSTHOG_SERVER_RE = /^(?:plugin_)?posthog(?:_[^_]+)*$/; const POSTHOG_VERB_RE = /^\s*(tools|search|info|schema|call)(?:\s+([\s\S]*))?\s*$/; @@ -33,7 +38,8 @@ export interface PostHogExecDisplay { } export function isPostHogExecTool(toolName: string): boolean { - return POSTHOG_EXEC_TOOL_RE.test(toolName); + const mcp = parseMcpToolName(toolName); + return !!mcp && mcp.tool === "exec" && POSTHOG_SERVER_RE.test(mcp.server); } export function getPostHogExecDisplay( diff --git a/packages/ui/src/features/sessions/components/ContextUsageIndicator.test.tsx b/packages/ui/src/features/sessions/components/ContextUsageIndicator.test.tsx new file mode 100644 index 0000000000..49945a0f7d --- /dev/null +++ b/packages/ui/src/features/sessions/components/ContextUsageIndicator.test.tsx @@ -0,0 +1,69 @@ +import type { ContextUsage } from "@posthog/ui/features/sessions/hooks/useContextUsage"; +import { Theme } from "@radix-ui/themes"; +import { render, screen } from "@testing-library/react"; +import { describe, expect, it } from "vitest"; +import { ContextUsageIndicator } from "./ContextUsageIndicator"; + +function usage(overrides?: Partial): ContextUsage { + return { + used: 50_000, + size: 200_000, + percentage: 25, + cost: null, + breakdown: null, + ...overrides, + }; +} + +describe("ContextUsageIndicator", () => { + it("renders nothing when usage is null", () => { + const { container } = render( + + + , + ); + expect(container.querySelector("button")).toBeNull(); + }); + + it("renders the compact used/size label, percentage, and aria-label", () => { + render( + + + , + ); + expect(screen.getByText(/50K\/200K · 25%/)).toBeInTheDocument(); + expect( + screen.getByRole("button", { name: "Context usage: 25%" }), + ).toBeInTheDocument(); + }); + + it("shows only the token count when the context window is unknown (size 0)", () => { + render( + + + , + ); + // No misleading "/0 · 0%" — just the used tokens. 
+ expect(screen.getByText("50K")).toBeInTheDocument(); + expect(screen.queryByText(/\/0/)).not.toBeInTheDocument(); + expect( + screen.getByRole("button", { name: "Context usage: 50K tokens" }), + ).toBeInTheDocument(); + }); + + it("renders a finite stroke offset at 0% (no NaN/Infinity)", () => { + const { container } = render( + + + , + ); + const progress = container.querySelectorAll("circle")[1]; + const offset = Number(progress?.getAttribute("stroke-dashoffset")); + expect(Number.isFinite(offset)).toBe(true); + expect(screen.getByText(/0\/200K · 0%/)).toBeInTheDocument(); + }); +}); diff --git a/packages/ui/src/features/sessions/components/ContextUsageIndicator.tsx b/packages/ui/src/features/sessions/components/ContextUsageIndicator.tsx index 94c0f599a2..1ae3ac46a1 100644 --- a/packages/ui/src/features/sessions/components/ContextUsageIndicator.tsx +++ b/packages/ui/src/features/sessions/components/ContextUsageIndicator.tsx @@ -19,6 +19,9 @@ export function ContextUsageIndicator({ usage }: ContextUsageIndicatorProps) { if (!usage) return null; const { used, size, percentage } = usage; + // The context window can be unknown (size 0) — show just the token count + // rather than a misleading "X/0 · 0%". 
+ const hasSize = size > 0; const strokeDashoffset = CIRCUMFERENCE - (percentage / 100) * CIRCUMFERENCE; const color = getOverallUsageColor(percentage); @@ -28,7 +31,11 @@ export function ContextUsageIndicator({ usage }: ContextUsageIndicatorProps) { diff --git a/packages/ui/src/features/sessions/components/ReasoningLevelSelector.test.tsx b/packages/ui/src/features/sessions/components/ReasoningLevelSelector.test.tsx new file mode 100644 index 0000000000..9631603355 --- /dev/null +++ b/packages/ui/src/features/sessions/components/ReasoningLevelSelector.test.tsx @@ -0,0 +1,89 @@ +import type { SessionConfigOption } from "@agentclientprotocol/sdk"; +import { Theme } from "@radix-ui/themes"; +import { render, screen, waitFor } from "@testing-library/react"; +import userEvent from "@testing-library/user-event"; +import { describe, expect, it, vi } from "vitest"; +import { ReasoningLevelSelector } from "./ReasoningLevelSelector"; + +function codexThoughtOption( + overrides?: Partial, +): SessionConfigOption { + return { + type: "select", + id: "effort", + name: "Reasoning effort", + category: "thought_level", + currentValue: "high", + options: [ + { name: "low", value: "low" }, + { name: "high", value: "high" }, + { name: "max", value: "max" }, + ], + ...overrides, + } as unknown as SessionConfigOption; +} + +describe("ReasoningLevelSelector", () => { + it("renders the active level as the trigger label for a codex thought_level option", () => { + render( + + + , + ); + expect( + screen.getByRole("button", { name: "Reasoning: high" }), + ).toBeInTheDocument(); + }); + + it("emits the raw value via onChange once the menu closes", async () => { + const onChange = vi.fn(); + const user = userEvent.setup(); + render( + + + , + ); + + await user.click(screen.getByRole("button", { name: "Reasoning: high" })); + const lowItem = await screen.findByRole("menuitemradio", { name: "low" }); + await user.click(lowItem); + + await waitFor(() => 
expect(onChange).toHaveBeenCalledWith("low")); + expect(onChange).toHaveBeenCalledTimes(1); + }); + + it("uses the 'Effort' label for the claude adapter", () => { + render( + + + , + ); + expect( + screen.getByRole("button", { name: "Effort: medium" }), + ).toBeInTheDocument(); + }); + + it.each([ + ["undefined option", undefined], + ["non-select type", codexThoughtOption({ type: "boolean" })], + ["empty options", codexThoughtOption({ options: [] })], + ])("renders no trigger for %s", (_label, option) => { + render( + , + ); + expect(screen.queryByRole("button")).not.toBeInTheDocument(); + }); +}); diff --git a/packages/ui/src/features/sessions/components/SessionView.tsx b/packages/ui/src/features/sessions/components/SessionView.tsx index ab832577da..84e7780658 100644 --- a/packages/ui/src/features/sessions/components/SessionView.tsx +++ b/packages/ui/src/features/sessions/components/SessionView.tsx @@ -192,8 +192,16 @@ export function SessionView({ isCloud, allowBypassPermissions, currentModeId, + modeOption, }); - }, [allowBypassPermissions, currentModeId, taskId, isCloud, sessionService]); + }, [ + allowBypassPermissions, + currentModeId, + taskId, + isCloud, + sessionService, + modeOption, + ]); const handleModeChange = useCallback( (nextMode: string) => { diff --git a/packages/ui/src/features/sessions/components/SteerQueueToggle.test.tsx b/packages/ui/src/features/sessions/components/SteerQueueToggle.test.tsx new file mode 100644 index 0000000000..f6f0627b31 --- /dev/null +++ b/packages/ui/src/features/sessions/components/SteerQueueToggle.test.tsx @@ -0,0 +1,75 @@ +import { renderHook } from "@testing-library/react"; +import { beforeEach, describe, expect, it } from "vitest"; +import { useSupportsNativeSteer } from "../hooks/useMessagingMode"; +import { + type AgentSession, + sessionStoreSetters, + useSessionStore, +} from "../sessionStore"; +import { steerQueueTooltip } from "./SteerQueueToggle"; + +function seedSession(overrides: Partial): void { + 
sessionStoreSetters.setSession({ + taskRunId: "run-1", + taskId: "task-1", + taskTitle: "Test", + channel: "agent-event:run-1", + events: [], + startedAt: 0, + status: "connected", + isPromptPending: false, + isCompacting: false, + promptStartedAt: null, + pendingPermissions: new Map(), + pausedDurationMs: 0, + messageQueue: [], + optimisticItems: [], + ...overrides, + }); +} + +describe("steer tooltip copy follows the session's native-steer capability", () => { + beforeEach(() => { + useSessionStore.setState((state) => { + state.sessions = {}; + state.taskIdIndex = {}; + }); + }); + + it.each([ + { + name: "codex (local): interrupts and resends", + session: { adapter: "codex" as const, isCloud: false }, + expectNative: false, + }, + { + name: "claude cloud: interrupts and resends", + session: { adapter: "claude" as const, isCloud: true }, + expectNative: false, + }, + { + name: "claude (local): folds natively at the next tool boundary", + session: { adapter: "claude" as const, isCloud: false }, + expectNative: true, + }, + ])( + "$name — supportsNativeSteer and rendered tooltip agree", + ({ session, expectNative }) => { + seedSession(session); + + const { result } = renderHook(() => useSupportsNativeSteer("task-1")); + expect(result.current).toBe(expectNative); + + const tooltip = steerQueueTooltip(true, result.current, "Cmd+S"); + if (expectNative) { + expect(tooltip).toContain( + "injects your message mid-turn at the next tool boundary", + ); + } else { + expect(tooltip).toContain( + "interrupts the current turn and resends with your message", + ); + } + }, + ); +}); diff --git a/packages/ui/src/features/sessions/components/SteerQueueToggle.tsx b/packages/ui/src/features/sessions/components/SteerQueueToggle.tsx index 113a56ad79..5b51da75bd 100644 --- a/packages/ui/src/features/sessions/components/SteerQueueToggle.tsx +++ b/packages/ui/src/features/sessions/components/SteerQueueToggle.tsx @@ -16,6 +16,19 @@ interface SteerQueueToggleProps { taskId: string; } 
+export function steerQueueTooltip( + isSteer: boolean, + supportsNativeSteer: boolean, + shortcut: string, +): string { + if (!isSteer) { + return `Queue: holds messages until the current turn ends. ${shortcut} to switch to Steer.`; + } + return supportsNativeSteer + ? `Steer: injects your message mid-turn at the next tool boundary. ${shortcut} to switch to Queue.` + : `Steer: interrupts the current turn and resends with your message. ${shortcut} to switch to Queue.`; +} + export function SteerQueueToggle({ taskId }: SteerQueueToggleProps) { const mode = useMessagingMode(taskId); const supportsNativeSteer = useSupportsNativeSteer(taskId); @@ -30,11 +43,7 @@ export function SteerQueueToggle({ taskId }: SteerQueueToggleProps) { ? `Queue (${queuedCount})` : "Queue"; - const tooltip = isSteer - ? supportsNativeSteer - ? `Steer: injects your message mid-turn at the next tool boundary. ${shortcut} to switch to Queue.` - : `Steer: interrupts the current turn and resends with your message. ${shortcut} to switch to Queue.` - : `Queue: holds messages until the current turn ends. ${shortcut} to switch to Steer.`; + const tooltip = steerQueueTooltip(isSteer, supportsNativeSteer, shortcut); const colorClass = isSteer ? 
"text-purple-11" : "text-gray-11"; diff --git a/packages/ui/src/features/sessions/components/UnifiedModelSelector.test.tsx b/packages/ui/src/features/sessions/components/UnifiedModelSelector.test.tsx new file mode 100644 index 0000000000..2ef6396fb2 --- /dev/null +++ b/packages/ui/src/features/sessions/components/UnifiedModelSelector.test.tsx @@ -0,0 +1,133 @@ +import type { + SessionConfigOption, + SessionConfigSelectGroup, +} from "@agentclientprotocol/sdk"; +import { Theme } from "@radix-ui/themes"; +import { render, screen } from "@testing-library/react"; +import userEvent from "@testing-library/user-event"; +import { describe, expect, it, vi } from "vitest"; +import { UnifiedModelSelector } from "./UnifiedModelSelector"; + +const groupedCodexModel: SessionConfigOption = { + type: "select", + id: "model", + name: "Model", + category: "model", + currentValue: "gpt-5.5", + options: [ + { + group: "openai", + name: "OpenAI", + options: [ + { value: "gpt-5.5", name: "GPT-5.5" }, + { value: "gpt-5.5-codex", name: "GPT-5.5 Codex" }, + ], + }, + { + group: "fable", + name: "Fable", + options: [{ value: "fable", name: "Fable" }], + }, + ] satisfies SessionConfigSelectGroup[], +}; + +const flatCodexModel: SessionConfigOption = { + type: "select", + id: "model", + name: "Model", + category: "model", + currentValue: "gpt-5.5", + options: [ + { value: "gpt-5.5", name: "GPT-5.5" }, + { value: "fable", name: "Fable" }, + ], +}; + +function renderSelector( + props: Partial> = {}, +) { + return render( + + + , + ); +} + +describe("UnifiedModelSelector", () => { + it("renders the codex adapter label, group labels, and grouped model items", async () => { + const user = userEvent.setup(); + renderSelector(); + + await user.click(screen.getByRole("button", { name: "Model" })); + + // Every model in every group renders as a radio item. 
+ expect( + await screen.findByRole("menuitemradio", { name: "GPT-5.5" }), + ).toBeInTheDocument(); + expect( + screen.getByRole("menuitemradio", { name: "GPT-5.5 Codex" }), + ).toBeInTheDocument(); + expect( + screen.getByRole("menuitemradio", { name: "Fable" }), + ).toBeInTheDocument(); + // Adapter MenuLabel + group MenuLabels render. + expect(screen.getByText("Codex")).toBeInTheDocument(); + expect(screen.getByText("OpenAI")).toBeInTheDocument(); + }); + + it("renders flat (ungrouped) model items", async () => { + const user = userEvent.setup(); + renderSelector({ modelOption: flatCodexModel }); + + await user.click(screen.getByRole("button", { name: "Model" })); + + expect( + await screen.findByRole("menuitemradio", { name: "GPT-5.5" }), + ).toBeInTheDocument(); + expect( + screen.getByRole("menuitemradio", { name: "Fable" }), + ).toBeInTheDocument(); + }); + + it("fires onModelChange exactly once with the picked value after the menu closes", async () => { + const user = userEvent.setup(); + const onModelChange = vi.fn(); + renderSelector({ onModelChange }); + + await user.click(screen.getByRole("button", { name: "Model" })); + await user.click( + await screen.findByRole("menuitemradio", { name: "GPT-5.5 Codex" }), + ); + + expect(onModelChange).toHaveBeenCalledExactlyOnceWith("gpt-5.5-codex"); + }); + + it("switches adapter via the 'Switch to Claude' item", async () => { + const user = userEvent.setup(); + const onAdapterChange = vi.fn(); + renderSelector({ onAdapterChange }); + + await user.click(screen.getByRole("button", { name: "Model" })); + await user.click( + await screen.findByRole("menuitem", { name: /switch to claude/i }), + ); + + expect(onAdapterChange).toHaveBeenCalledExactlyOnceWith("claude"); + }); + + it("renders a disabled loading button with no menu while connecting", () => { + renderSelector({ isConnecting: true }); + + const button = screen.getByRole("button", { name: /loading/i }); + expect(button).toHaveAttribute("aria-disabled", 
"true"); + expect( + screen.queryByRole("button", { name: "Model" }), + ).not.toBeInTheDocument(); + }); +}); diff --git a/packages/ui/src/features/sessions/components/new-thread/buildThreadGroups.ts b/packages/ui/src/features/sessions/components/new-thread/buildThreadGroups.ts index e4ea31e509..9253179fc1 100644 --- a/packages/ui/src/features/sessions/components/new-thread/buildThreadGroups.ts +++ b/packages/ui/src/features/sessions/components/new-thread/buildThreadGroups.ts @@ -1,4 +1,5 @@ import type { Icon } from "@phosphor-icons/react"; +import { readAgentToolName } from "@posthog/shared"; import type { ConversationItem } from "@posthog/ui/features/sessions/components/buildConversationItems"; import { buildDoneLabel, @@ -66,10 +67,7 @@ export interface ThreadGrouping { } function getToolName(update: { _meta?: unknown }): string | undefined { - const meta = update._meta as - | { claudeCode?: { toolName?: string } } - | undefined; - return meta?.claudeCode?.toolName; + return readAgentToolName(update._meta); } function isMcpToolItem(item: ConversationItem): boolean { diff --git a/packages/ui/src/features/sessions/components/session-update/ToolCallBlock.test.tsx b/packages/ui/src/features/sessions/components/session-update/ToolCallBlock.test.tsx new file mode 100644 index 0000000000..8ac51cfdc8 --- /dev/null +++ b/packages/ui/src/features/sessions/components/session-update/ToolCallBlock.test.tsx @@ -0,0 +1,118 @@ +import { ServiceProvider } from "@posthog/di/react"; +import { posthogToolMeta } from "@posthog/shared"; +import type { ToolCall } from "@posthog/ui/features/sessions/types"; +import { Theme } from "@radix-ui/themes"; +import { render, screen } from "@testing-library/react"; +import { Container } from "inversify"; +import type { ReactNode } from "react"; +import { describe, expect, it, vi } from "vitest"; +import { MCP_TOOL_BLOCK_COMPONENT } from "./identifiers"; +import { ToolCallBlock } from "./ToolCallBlock"; +import type { ToolViewProps } from 
"./toolCallUtils"; + +// EditToolView's leaf renderers reach outside the unit under test: FileMentionChip +// pulls workspace/tRPC context, and CodePreview mounts a web component that needs +// a real CSSStyleSheet. The edit-routing test only cares that ToolCallBlock +// dispatched to EditToolView, so stub both to their load-bearing inputs. +vi.mock("./FileMentionChip", () => ({ + FileMentionChip: ({ filePath }: { filePath: string }) => ( + {filePath} + ), +})); +vi.mock("./CodePreview", () => ({ + CodePreview: () => code-preview, +})); + +function renderBlock( + toolCall: ToolCall, + mcpToolBlock?: (props: ToolViewProps & { mcpToolName: string }) => ReactNode, +) { + const container = new Container(); + if (mcpToolBlock) { + container.bind(MCP_TOOL_BLOCK_COMPONENT).toConstantValue(mcpToolBlock); + } + return render( + + + + + , + ); +} + +describe("ToolCallBlock codex routing", () => { + it("routes a codex MCP descriptor to the bound McpToolBlock with the canonical name", () => { + const seen: { mcpToolName?: string } = {}; + const McpToolBlock = vi.fn( + ({ mcpToolName }: ToolViewProps & { mcpToolName: string }) => { + seen.mcpToolName = mcpToolName; + return
mcp-block-rendered
; + }, + ); + + renderBlock( + { + toolCallId: "tc-mcp", + title: "exec", + kind: "other", + status: "completed", + rawInput: { query: "select 1" }, + _meta: posthogToolMeta({ + toolName: "mcp__posthog__exec", + mcp: { server: "posthog", tool: "exec" }, + }), + }, + McpToolBlock, + ); + + expect(screen.getByText("mcp-block-rendered")).toBeInTheDocument(); + expect(seen.mcpToolName).toBe("mcp__posthog__exec"); + }); + + it("falls back to the generic tool view for an MCP call when no McpToolBlock is bound", () => { + renderBlock({ + toolCallId: "tc-mcp-fallback", + title: "exec", + kind: "other", + status: "completed", + rawInput: { query: "select 1" }, + _meta: posthogToolMeta({ + toolName: "mcp__posthog__exec", + mcp: { server: "posthog", tool: "exec" }, + }), + }); + + // The MCP branch renders the title in its header; assert it lands somewhere + // (i.e. the call did not blow up unbound) without an MCP block present. + expect(screen.getByText("exec")).toBeInTheDocument(); + }); + + it("routes a codex edit tool call (no _meta) to the edit view with diff stats", () => { + renderBlock({ + toolCallId: "tc-edit", + title: "Edit a.ts", + kind: "edit", + status: "completed", + content: [{ type: "diff", path: "a.ts", oldText: "x", newText: "y" }], + locations: [{ path: "a.ts" }], + }); + + expect(screen.getByText("a.ts")).toBeInTheDocument(); + expect(screen.getByText("+1")).toBeInTheDocument(); + expect(screen.getByText("-1")).toBeInTheDocument(); + }); + + it("routes a codex execute tool call (no _meta) to the execute view header", () => { + renderBlock({ + toolCallId: "tc-exec", + title: "run tests", + kind: "execute", + status: "completed", + rawInput: { command: "pnpm test", description: "Run tests" }, + content: [{ type: "content", content: { type: "text", text: "ok" } }], + }); + + expect(screen.getByText("Run tests")).toBeInTheDocument(); + expect(screen.getByText("pnpm test")).toBeInTheDocument(); + }); +}); diff --git 
a/packages/ui/src/features/sessions/components/session-update/ToolCallBlock.tsx b/packages/ui/src/features/sessions/components/session-update/ToolCallBlock.tsx index fe3a9b8a73..dcfe1732db 100644 --- a/packages/ui/src/features/sessions/components/session-update/ToolCallBlock.tsx +++ b/packages/ui/src/features/sessions/components/session-update/ToolCallBlock.tsx @@ -1,4 +1,5 @@ import { useServiceOptional } from "@posthog/di/react"; +import { readAgentToolName, readMcpToolName } from "@posthog/shared"; import { DeleteToolView } from "@posthog/ui/features/sessions/components/session-update/DeleteToolView"; import { EditToolView } from "@posthog/ui/features/sessions/components/session-update/EditToolView"; import { ExecuteToolView } from "@posthog/ui/features/sessions/components/session-update/ExecuteToolView"; @@ -36,10 +37,8 @@ export function ToolCallBlock({ const McpToolBlock = useServiceOptional( MCP_TOOL_BLOCK_COMPONENT, ); - const meta = toolCall._meta as - | { claudeCode?: { toolName?: string } } - | undefined; - const toolName = meta?.claudeCode?.toolName; + const toolName = readAgentToolName(toolCall._meta); + const mcpToolName = readMcpToolName(toolCall._meta); const chatChrome = useChatThreadChrome(); if (toolName === "EnterPlanMode") { @@ -70,13 +69,13 @@ export function ToolCallBlock({ ); } - if (toolName?.startsWith("mcp__")) { + if (mcpToolName) { return ( {McpToolBlock ? 
( - + ) : ( - + )} ); diff --git a/packages/ui/src/features/sessions/hooks/useMessagingMode.ts b/packages/ui/src/features/sessions/hooks/useMessagingMode.ts index e63d40e790..1d00ceecdc 100644 --- a/packages/ui/src/features/sessions/hooks/useMessagingMode.ts +++ b/packages/ui/src/features/sessions/hooks/useMessagingMode.ts @@ -1,3 +1,4 @@ +import { sessionSupportsNativeSteer } from "@posthog/shared"; import { type MessagingMode, useMessagingModeStore, @@ -15,9 +16,11 @@ export function useMessagingMode(taskId: string | undefined): MessagingMode { } /** - * Whether the task's session steers natively (Claude, local) versus falling - * back to interrupt-and-resend (Codex, cloud). Drives the steer label/tooltip, - * not whether steer is allowed: every adapter supports steer in some form. + * Whether the task's session steers natively (folds a mid-turn message into the + * running turn) versus falling back to interrupt-and-resend. Driven by the + * adapter's negotiated `steering` capability — same decision as the host's + * sendPrompt gate — so Claude and codex app-server steer, codex-acp and cloud + * resend. Drives the steer label/tooltip, not whether steer is allowed. 
*/ export function useSupportsNativeSteer(taskId: string | undefined): boolean { return useSessionStore((s) => { @@ -25,6 +28,6 @@ export function useSupportsNativeSteer(taskId: string | undefined): boolean { const taskRunId = s.taskIdIndex[taskId]; if (!taskRunId) return false; const session = s.sessions[taskRunId]; - return !!session && !session.isCloud && session.adapter === "claude"; + return !!session && sessionSupportsNativeSteer(session); }); } diff --git a/packages/ui/src/features/sessions/sessionServiceHost.recovery.integration.test.ts b/packages/ui/src/features/sessions/sessionServiceHost.recovery.integration.test.ts index bb44ec8658..abb7924fa0 100644 --- a/packages/ui/src/features/sessions/sessionServiceHost.recovery.integration.test.ts +++ b/packages/ui/src/features/sessions/sessionServiceHost.recovery.integration.test.ts @@ -213,6 +213,10 @@ vi.mock("@posthog/ui/features/sidebar/taskMetaApi", () => ({ vi.mock("@posthog/ui/shell/posthogAnalyticsImpl", () => ({ track: vi.fn(), buildPermissionToolMetadata: vi.fn(() => ({})), + posthogFeatureFlags: { + isEnabled: vi.fn(() => undefined), + onFlagsLoaded: vi.fn(), + }, })); vi.mock("../../shell/logger", () => ({ logger: { diff --git a/packages/ui/src/features/sessions/sessionServiceHost.test.ts b/packages/ui/src/features/sessions/sessionServiceHost.test.ts index 49e3c443d9..84144af24a 100644 --- a/packages/ui/src/features/sessions/sessionServiceHost.test.ts +++ b/packages/ui/src/features/sessions/sessionServiceHost.test.ts @@ -244,6 +244,10 @@ vi.mock("@posthog/ui/features/sidebar/taskMetaApi", () => ({ vi.mock("@posthog/ui/shell/posthogAnalyticsImpl", () => ({ track: vi.fn(), buildPermissionToolMetadata: vi.fn(() => ({})), + posthogFeatureFlags: { + isEnabled: vi.fn(() => undefined), + onFlagsLoaded: vi.fn(), + }, })); vi.mock("../../shell/logger", () => ({ logger: { @@ -899,6 +903,7 @@ describe("SessionService", () => { id: "mode", currentValue: "full-access", options: [ + expect.objectContaining({ value: 
"plan" }), expect.objectContaining({ value: "read-only" }), expect.objectContaining({ value: "auto" }), expect.objectContaining({ value: "full-access" }), diff --git a/packages/ui/src/features/sessions/sessionServiceHost.ts b/packages/ui/src/features/sessions/sessionServiceHost.ts index 9a57f913e7..c0d0e30b80 100644 --- a/packages/ui/src/features/sessions/sessionServiceHost.ts +++ b/packages/ui/src/features/sessions/sessionServiceHost.ts @@ -37,6 +37,7 @@ import { WORKSPACE_QUERY_KEY } from "@posthog/ui/features/workspace/identifiers" import { toast } from "@posthog/ui/primitives/toast"; import { buildPermissionToolMetadata, + posthogFeatureFlags, track, } from "@posthog/ui/shell/posthogAnalyticsImpl"; import { logger } from "../../shell/logger"; @@ -80,6 +81,7 @@ function buildSessionServiceDeps(): SessionServiceDeps { ); }, buildPermissionToolMetadata, + featureFlags: posthogFeatureFlags, notifyPermissionRequest: (taskTitle, taskId) => resolveService(NotificationBus).notifyPermissionRequest( taskTitle, diff --git a/packages/workspace-server/src/services/agent/agent.ts b/packages/workspace-server/src/services/agent/agent.ts index 83341d0c1e..b07de98a48 100644 --- a/packages/workspace-server/src/services/agent/agent.ts +++ b/packages/workspace-server/src/services/agent/agent.ts @@ -262,6 +262,12 @@ interface SessionConfig { /** The agent's session ID (for resume - SDK session ID for Claude, Codex's session ID for Codex) */ sessionId?: string; adapter?: "claude" | "codex"; + /** + * Resolved `codex-app-server` flag for the current user. When true and the + * adapter is codex, the agent uses the native app-server sub-adapter; when + * false/undefined it uses codex-acp. Ignored by the Claude adapter. 
+ */ + useCodexAppServer?: boolean; /** Permission mode to use for the session */ permissionMode?: string; /** Custom instructions injected into the system prompt */ @@ -284,6 +290,16 @@ interface SessionConfig { importedSessionId?: string; } +/** Pull the adapter's `agentCapabilities._meta.posthog.steering` from initialize. */ +function extractSteeringCapability(init: unknown): string | undefined { + const steering = ( + init as { + agentCapabilities?: { _meta?: { posthog?: { steering?: unknown } } }; + } + )?.agentCapabilities?._meta?.posthog?.steering; + return typeof steering === "string" ? steering : undefined; +} + interface ManagedSession { taskRunId: string; taskId: string; @@ -298,6 +314,8 @@ interface ManagedSession { promptPending: boolean; pendingContext?: string; configOptions?: SessionConfigOption[]; + /** Adapter's negotiated steering capability from initialize (`_meta.posthog.steering`). */ + steering?: string; /** Tracks in-flight MCP tool calls (toolCallId → toolKey) for cancellation */ inFlightMcpToolCalls: Map; /** MCP tool approval states fetched at session start */ @@ -675,6 +693,7 @@ If a repository IS genuinely required, attach one in this priority order: credentials, logUrl, adapter, + useCodexAppServer, permissionMode, customInstructions, systemPromptOverride, @@ -787,6 +806,7 @@ If a repository IS genuinely required, attach one in this priority order: const acpConnection = await agent.run(taskId, taskRunId, { adapter, + useCodexAppServer, gatewayUrl: proxyUrl, codexBinaryPath: adapter === "codex" ? 
this.getCodexBinaryPath() : undefined, @@ -839,7 +859,7 @@ If a repository IS genuinely required, attach one in this priority order: clientStreams, ); - await connection.initialize({ + const initResult = await connection.initialize({ protocolVersion: PROTOCOL_VERSION, clientCapabilities: { fs: { @@ -849,6 +869,11 @@ If a repository IS genuinely required, attach one in this priority order: terminal: true, }, }); + // The adapter advertises whether mid-turn steering folds natively into the + // running turn (`steering: "native"`) vs needs cancel+resend. Surface it so + // the host gates steer-vs-resend on the negotiated capability, not on a + // hardcoded adapter name (codex-acp advertises "interrupt-resend"). + const steering = extractSteeringCapability(initResult); const { servers: mcpServers, @@ -1054,6 +1079,7 @@ If a repository IS genuinely required, attach one in this priority order: config, promptPending: false, configOptions, + steering, inFlightMcpToolCalls: new Map(), mcpToolApprovals: toolApprovals, toolInstallations, @@ -1901,6 +1927,8 @@ For git operations while detached: logUrl: "logUrl" in params ? params.logUrl : undefined, sessionId: "sessionId" in params ? params.sessionId : undefined, adapter: "adapter" in params ? params.adapter : undefined, + useCodexAppServer: + "useCodexAppServer" in params ? params.useCodexAppServer : undefined, permissionMode: "permissionMode" in params ? 
params.permissionMode : undefined, customInstructions: @@ -1924,6 +1952,7 @@ For git operations while detached: sessionId: session.taskRunId, channel: session.channel, configOptions: session.configOptions, + steering: session.steering, }; } diff --git a/packages/workspace-server/src/services/agent/schemas.ts b/packages/workspace-server/src/services/agent/schemas.ts index 493e79943e..477630edfb 100644 --- a/packages/workspace-server/src/services/agent/schemas.ts +++ b/packages/workspace-server/src/services/agent/schemas.ts @@ -52,6 +52,12 @@ export const startSessionInput = z.object({ autoProgress: z.boolean().optional(), runMode: z.enum(["local", "cloud"]).optional(), adapter: z.enum(["claude", "codex"]).optional(), + /** + * Resolved value of the `codex-app-server` PostHog flag (evaluated host-side + * for the current user). When true and adapter is "codex", the agent uses the + * native app-server sub-adapter instead of codex-acp. Ignored for Claude. + */ + useCodexAppServer: z.boolean().optional(), additionalDirectories: z.array(z.string()).optional(), customInstructions: z.string().max(2000).optional(), /** @@ -136,6 +142,11 @@ export const sessionResponseSchema = z.object({ sessionId: z.string(), channel: z.string(), configOptions: z.array(sessionConfigOptionSchema).optional(), + // The adapter's negotiated steering capability from initialize + // (`_meta.posthog.steering`): "native" folds a mid-turn message into the + // running turn; "interrupt-resend" (codex-acp) or absent means the host must + // cancel + resend instead. Drives the host's steer-vs-resend decision. + steering: z.string().optional(), }); export type SessionResponse = z.infer; @@ -194,6 +205,8 @@ export const reconnectSessionInput = z.object({ logUrl: z.string().optional(), sessionId: z.string().optional(), adapter: z.enum(["claude", "codex"]).optional(), + /** See startSessionInput.useCodexAppServer — re-resolved on reconnect. 
*/ + useCodexAppServer: z.boolean().optional(), /** Additional directories Claude can access beyond cwd (for worktree support) */ additionalDirectories: z.array(z.string()).optional(), permissionMode: z.string().optional(), diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 3cf4edaa57..d606082cb9 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -700,6 +700,9 @@ importers: '@modelcontextprotocol/sdk': specifier: 1.29.0 version: 1.29.0(zod@4.4.3) + '@openai/codex': + specifier: 0.140.0 + version: 0.140.0 '@opentelemetry/api-logs': specifier: ^0.208.0 version: 0.208.0 @@ -3878,6 +3881,47 @@ packages: '@open-draft/until@2.1.0': resolution: {integrity: sha512-U69T3ItWHvLwGg5eJ0n3I62nWuE6ilHlmz7zM0npLBRvPRd7e6NYmg54vvRtP5mZG7kZqZCFVdsTWo7BPtBujg==} + '@openai/codex@0.140.0': + resolution: {integrity: sha512-FMnN12kJzVPljMTYRydLCNgd0cXXmVasNSfq2PtS42RMEIxoQ3dHtMvmno35hu2tfwrKNAAPCm4s+2PaFTEBGg==} + engines: {node: '>=16'} + hasBin: true + + '@openai/codex@0.140.0-darwin-arm64': + resolution: {integrity: sha512-KDyQHsxdc8FHZKziSBXs82ABgben/8lLPdhi2Nu+wj6qs2RAp4k/IvE8foafVnp3OeGqhtEFbhlZp0H4Dg/Slg==} + engines: {node: '>=16'} + cpu: [arm64] + os: [darwin] + + '@openai/codex@0.140.0-darwin-x64': + resolution: {integrity: sha512-xA77AcKbP8BKxKqaJz8bqXtU1dUtanEKpWCMJ68LuYU054EC31BD7NftFe5/vpLUQR95fhRr7V9a91SLtCuLAg==} + engines: {node: '>=16'} + cpu: [x64] + os: [darwin] + + '@openai/codex@0.140.0-linux-arm64': + resolution: {integrity: sha512-rGOgWEONilm+pQoQgcGpPRzvnou1CawyBOe8gvtuS32PQ00Pn+9nZF4O7iKBVlNh6Jeun8kpdJSjFdULm2wr4A==} + engines: {node: '>=16'} + cpu: [arm64] + os: [linux] + + '@openai/codex@0.140.0-linux-x64': + resolution: {integrity: sha512-7+N/cHB74nsDkOoL+VQVFVFRlfGj6GFSIAQHgs9DQIsvG+UdzWgUeeDE3l926taJqmzcP9NH8bysptKlZ2Ff6g==} + engines: {node: '>=16'} + cpu: [x64] + os: [linux] + + '@openai/codex@0.140.0-win32-arm64': + resolution: {integrity: sha512-vs5Ed5OF+4671SZoO0MN5WoHl/K9aOSNzLgzbyyDyM7Jwm/PZYvF6OmIPRWf5AGatYqEOWt8Ovp5+df5PFPM7A==} + engines: 
{node: '>=16'} + cpu: [arm64] + os: [win32] + + '@openai/codex@0.140.0-win32-x64': + resolution: {integrity: sha512-dP+nzd8UQ3Gdby+F5x0Sxd0hu6V9s6/cZYFsGtmmA6eCpU+IIu5tCOnUfgSu5HDw4BvXg046yd8Ihy5bOhwO4A==} + engines: {node: '>=16'} + cpu: [x64] + os: [win32] + '@opentelemetry/api-logs@0.208.0': resolution: {integrity: sha512-CjruKY9V6NMssL/T1kAFgzosF1v9o6oeN+aX5JB/C/xPNtmgIJqcXHG7fA82Ou1zCpWGl4lROQUKwUNE1pMCyg==} engines: {node: '>=8.0.0'} @@ -16288,6 +16332,33 @@ snapshots: '@open-draft/until@2.1.0': {} + '@openai/codex@0.140.0': + optionalDependencies: + '@openai/codex-darwin-arm64': '@openai/codex@0.140.0-darwin-arm64' + '@openai/codex-darwin-x64': '@openai/codex@0.140.0-darwin-x64' + '@openai/codex-linux-arm64': '@openai/codex@0.140.0-linux-arm64' + '@openai/codex-linux-x64': '@openai/codex@0.140.0-linux-x64' + '@openai/codex-win32-arm64': '@openai/codex@0.140.0-win32-arm64' + '@openai/codex-win32-x64': '@openai/codex@0.140.0-win32-x64' + + '@openai/codex@0.140.0-darwin-arm64': + optional: true + + '@openai/codex@0.140.0-darwin-x64': + optional: true + + '@openai/codex@0.140.0-linux-arm64': + optional: true + + '@openai/codex@0.140.0-linux-x64': + optional: true + + '@openai/codex@0.140.0-win32-arm64': + optional: true + + '@openai/codex@0.140.0-win32-x64': + optional: true + '@opentelemetry/api-logs@0.208.0': dependencies: '@opentelemetry/api': 1.9.0