diff --git a/.changeset/fix-f7-destroy-tombstone.md b/.changeset/fix-f7-destroy-tombstone.md new file mode 100644 index 0000000..6a9b85e --- /dev/null +++ b/.changeset/fix-f7-destroy-tombstone.md @@ -0,0 +1,25 @@ +--- +"sql-fs-api": minor +--- + +fix(session): destroy now reaches warm sessions on other replicas (F7). + +Destroying a sandbox on one replica previously left warm sessions on other +replicas serving ghost state: a written session would reload a deleted tree +into an empty pathCache (surfacing as a non-zero exit + garbage stderr inside an +HTTP 200 exec), and a never-written session would never reload at all because +the deleted version key read as 0 and matched its `lastSeenVersion === 0`. + +Two layered fixes: + +- Primary (Redis-independent): `SqlFs.reload()` now detects a zero-row + `loadAllPaths` — which for a live sandbox always returns at least its root dir + — and throws a typed `ESANDBOXGONE` instead of installing an empty pathCache. + The session manager catches it, tears the stale warm session down (drops it + from the pool and disconnects the per-session Postgres pool), and surfaces a + clean `ENOENT` → 404. +- Secondary (tombstone): `destroy` now writes a distinct `DESTROYED` sentinel to + the version key (with the version-key TTL) instead of deleting it. + `ensureFreshCache` recognises the sentinel before the numeric parse and tears + the session down — covering the never-written variant. Re-creating a + tombstoned sandbox clears the sentinel and starts cleanly at version 0. diff --git a/src/api/session-manager.ts b/src/api/session-manager.ts index fc9e423..e9edfb8 100644 --- a/src/api/session-manager.ts +++ b/src/api/session-manager.ts @@ -15,9 +15,10 @@ import type { Redis } from "ioredis"; import { Bash } from "just-bash"; import type { BashExecResult, DefenseInDepthConfig, ExecOptions, IFileSystem, SecurityViolation } from "just-bash"; +import { createEnoent } from "../sql-fs/errors.js"; import { createPostgresSandboxFs, destroyPostgresSandbox } from "../sql-fs/index.js"; import type { RedisBlobCache } from "../sql-fs/redis-blob-cache.js"; -import { type RedisPathSnapshot, versionKey } from "../sql-fs/redis-path-snapshot.js"; +import { type RedisPathSnapshot, VERSION_TOMBSTONE, versionKey } from "../sql-fs/redis-path-snapshot.js"; import { SessionScopedFs } from "../sql-fs/session-scoped-fs.js"; import type { ICoherentFs, IReadOnlyScopeFs, IScriptTxFs } from "../sql-fs/sql-fs.js"; import type { PathCacheEntry, SandboxListEntry, SandboxMeta } from "../sql-fs/types.js"; @@ -575,7 +576,19 @@ export class SessionManager { try { // GETEX so rehydrating a session refreshes the counter's TTL (audit H6). const raw = await this.redis.getex(versionKey(tenantId, sandboxId), "EX", VERSION_KEY_TTL_SECONDS); - initialVersion = raw === null ? 0 : Number(raw) || 0; + if (raw === VERSION_TOMBSTONE) { + // F7: a tombstone here means the sandbox was destroyed but + // `buildFs` (createSandbox) just re-inserted the root — this + // session owns a freshly re-created sandbox. Clear the sentinel + // so a later `incr` does not run against the non-numeric string + // (which Redis rejects) and so a sibling warm session is not + // torn down for a sandbox that is alive again. Start at 0, the + // fresh-sandbox baseline. + await this.redis.del(versionKey(tenantId, sandboxId)); + initialVersion = 0; + } else { + initialVersion = raw === null ? 0 : Number(raw) || 0; + } } catch { initialVersion = 0; } @@ -699,6 +712,11 @@ export class SessionManager { try { await this.ensureFreshCache(tenantId, sandboxId, session); } catch (err) { + // F7: ensureFreshCache tears the session down and throws ENOENT when the + // sandbox is gone (tombstone or zero-row reload). That ENOENT is the + // definitive answer — surface it as a clean 404. Do NOT remap it to + // ESESSIONCLOSING below just because teardown set state="closing". + if ((err as Error & { code?: string }).code === "ENOENT") throw err; if (this.isClosing(session)) { throw Object.assign(new Error("ESESSIONCLOSING: session is shutting down, retry"), { code: "ESESSIONCLOSING", @@ -789,6 +807,9 @@ export class SessionManager { try { await this.ensureFreshCache(tenantId, sandboxId, session); } catch (err) { + // F7: a gone-sandbox teardown surfaces ENOENT — the definitive 404 + // answer. Don't remap it to ESESSIONCLOSING (see withSessionEntry). + if ((err as Error & { code?: string }).code === "ENOENT") throw err; if (this.isClosing(session)) { throw Object.assign(new Error("ESESSIONCLOSING: session is shutting down, retry"), { code: "ESESSIONCLOSING", @@ -915,17 +936,28 @@ export class SessionManager { // be holding a `lastSeenVersion` when the key expires and the counter // resets to 1 — closing the stale-read / lost-update wrap (audit H6). const raw = await this.redis!.getex(versionKey(tenantId, sandboxId), "EX", VERSION_KEY_TTL_SECONDS); + // F7: recognise the destroy tombstone BEFORE the numeric parse — + // `Number("DESTROYED") || 0` would clamp to 0 and, for a + // never-written session (lastSeenVersion === 0), silently match + // (`0 === 0`) so no reload would ever fire and the warm session + // would serve ghost state. Tear the session down and surface a + // clean ENOENT → 404 instead. + if (raw === VERSION_TOMBSTONE) { + await this.tearDownGoneSession(tenantId, sandboxId, session); + throw createEnoent(`/${sandboxId}`); + } current = raw === null ? 0 : Number(raw) || 0; } catch (err) { + if ((err as Error & { code?: string }).code === "ENOENT") throw err; // Redis unavailable — can't determine whether the cache is fresh. // Reload from Postgres so we don't serve stale data. console.error(JSON.stringify({ event: "version_get_error", sandboxId, error: (err as Error).message })); - await coherent.reload(); + await this.reloadOrTearDownGone(tenantId, sandboxId, session, coherent); return; } if (session.lastSeenVersion !== current) { - await coherent.reload(); + await this.reloadOrTearDownGone(tenantId, sandboxId, session, coherent); session.lastSeenVersion = current; coherent.clearDirty(); } @@ -937,6 +969,48 @@ export class SessionManager { return probe; } + /** + * F7 (primary, Redis-independent): wrap `reload()` so a zero-row + * `loadAllPaths` — which `SqlFs` raises as ESANDBOXGONE — tears the warm + * session down and surfaces a clean ENOENT instead of installing an empty + * pathCache (which would serve ghost ENOENTs / break cwd resolution inside + * bash.exec). Any other reload error propagates unchanged. + */ + private async reloadOrTearDownGone( + tenantId: string, + sandboxId: string, + session: Session, + coherent: ICoherentFs, + ): Promise { + try { + await coherent.reload(); + } catch (err) { + if ((err as Error & { code?: string }).code === "ESANDBOXGONE") { + await this.tearDownGoneSession(tenantId, sandboxId, session); + throw createEnoent(`/${sandboxId}`); + } + throw err; + } + } + + /** + * F7: tear down a warm session whose sandbox has been destroyed on another + * replica. Mirrors the reaper teardown — mark `closing` so any straggler + * that already captured this reference observes ESESSIONCLOSING, drop it from + * the map, and disconnect the per-session Postgres pool so it cannot leak. + * Idempotent: a session already `closing` (concurrent destroy/reaper) is left + * for that path to finish. + */ + private async tearDownGoneSession(tenantId: string, sandboxId: string, session: Session): Promise { + const key = this.sessionKey(tenantId, sandboxId); + if (session.state !== "closing") { + session.state = "closing"; + } + this.sessions.delete(key); + await this.disconnectFs(session.fs); + console.error(JSON.stringify({ event: "session_torn_down_sandbox_gone", tenantId, sandboxId })); + } + private async publishVersionIfDirty(tenantId: string, sandboxId: string, session: Session): Promise { if (this.redis === undefined) return; const coherent = asCoherentFs(session.fs); @@ -1191,8 +1265,10 @@ export class SessionManager { // rethrow at the end. let cleanupError: unknown; try { + let destroySucceeded = false; try { await this.destroySandboxFn(tenantId, sandboxId); + destroySucceeded = true; } catch (e) { cleanupError ??= e; console.error( @@ -1203,17 +1279,19 @@ export class SessionManager { }), ); } - try { - await this.deleteVersionKey(tenantId, sandboxId); - } catch (e) { - cleanupError ??= e; - } - if (this.pathSnapshot !== undefined) { + if (destroySucceeded) { try { - await this.pathSnapshot.delete(tenantId, sandboxId); + await this.deleteVersionKey(tenantId, sandboxId); } catch (e) { cleanupError ??= e; } + if (this.pathSnapshot !== undefined) { + try { + await this.pathSnapshot.delete(tenantId, sandboxId); + } catch (e) { + cleanupError ??= e; + } + } } } finally { await this.disconnectFs(session.fs); @@ -1227,10 +1305,22 @@ export class SessionManager { }); } + /** + * F7: tombstone the version key on destroy instead of `DEL`. + * + * A bare `DEL` re-introduces the "counter-absent == 0" conflation: a warm + * session on another replica probes the absent key, reads `0`, and either + * reloads a deleted tree (written variant) or — if it never wrote and so + * holds `lastSeenVersion === 0` — sees `0 === 0` and NEVER reloads, serving + * ghost state forever. Writing a distinct, non-numeric sentinel + * (`VERSION_TOMBSTONE`) lets `ensureFreshCache` recognise the destroy and + * tear the session down regardless of its last-seen version. The sentinel + * inherits the version-key TTL so it self-cleans like any other counter. + */ private async deleteVersionKey(tenantId: string, sandboxId: string): Promise { if (this.redis === undefined) return; try { - await this.redis.del(versionKey(tenantId, sandboxId)); + await this.redis.set(versionKey(tenantId, sandboxId), VERSION_TOMBSTONE, "EX", VERSION_KEY_TTL_SECONDS); } catch { // swallow — best-effort cleanup } diff --git a/src/api/tests/unit/session-manager.f7-destroy-tombstone.test.ts b/src/api/tests/unit/session-manager.f7-destroy-tombstone.test.ts new file mode 100644 index 0000000..96e73b3 --- /dev/null +++ b/src/api/tests/unit/session-manager.f7-destroy-tombstone.test.ts @@ -0,0 +1,344 @@ +/** + * F7 unit tests: destroy must reach warm sessions on other replicas. + * + * Two layered defences: + * - Primary (Redis-independent): a zero-row `loadAllPaths` surfaces as + * ESANDBOXGONE from `reload()`; `ensureFreshCache` tears the warm session + * down and converts it to a clean ENOENT (→ 404) instead of installing an + * empty pathCache (ghost state). + * - Secondary (tombstone): `destroy` writes a DESTROYED sentinel to the version + * key; `ensureFreshCache` recognises it BEFORE the numeric parse and tears + * down — covering the never-written variant (lastSeenVersion === 0) where an + * absent key would read as 0 and never trigger a reload. + */ + +import type { Redis } from "ioredis"; +import type { IFileSystem } from "just-bash"; +import { describe, expect, it, vi } from "vitest"; +import { createEsandboxgone } from "../../../sql-fs/errors.js"; +import { SessionManager } from "../../session-manager.js"; + +interface Entry { + value: string; + expiresAt: number; +} + +/** + * Fake Redis covering the version-counter ops AND the distributed RW-lock + * eval scripts (so withExecLockExclusive/destroy run). Mirrors the fake in + * session-manager.version-counter.test.ts. + */ +class FakeRedis { + store = new Map(); // version keys + strings = new Map(); // lock writer keys (eval-managed) + zsets = new Map>(); + + private gc(): void { + const now = Date.now(); + for (const [k, e] of this.store) if (e.expiresAt <= now) this.store.delete(k); + for (const [k, e] of this.strings) if (e.expiresAt <= now) this.strings.delete(k); + } + + private reapZset(key: string, nowMs: number): void { + const z = this.zsets.get(key); + if (!z) return; + for (const [m, score] of z) if (score <= nowMs) z.delete(m); + } + + private getZset(key: string): Map { + let z = this.zsets.get(key); + if (!z) { + z = new Map(); + this.zsets.set(key, z); + } + return z; + } + + async set(key: string, value: string, unit: "PX" | "EX", amount: number, nx?: "NX"): Promise<"OK" | null> { + this.gc(); + if (nx === "NX" && this.store.has(key)) return null; + const ms = unit === "EX" ? amount * 1000 : amount; + this.store.set(key, { value, expiresAt: Date.now() + ms }); + return "OK"; + } + + async get(key: string): Promise { + this.gc(); + return this.store.get(key)?.value ?? null; + } + + async getex(key: string, _ex: "EX", seconds: number): Promise { + this.gc(); + const e = this.store.get(key); + if (e === undefined) return null; + e.expiresAt = Date.now() + seconds * 1000; + return e.value; + } + + async incr(key: string): Promise { + this.gc(); + const current = Number(this.store.get(key)?.value ?? "0") || 0; + const next = current + 1; + this.store.set(key, { value: String(next), expiresAt: Date.now() + 60_000 }); + return next; + } + + async expire(key: string, seconds: number): Promise { + const e = this.store.get(key); + if (e === undefined) return 0; + e.expiresAt = Date.now() + seconds * 1000; + return 1; + } + + async del(key: string): Promise { + return this.store.delete(key) ? 1 : 0; + } + + async eval(script: string, numKeys: number, ...args: string[]): Promise { + this.gc(); + const keys = args.slice(0, numKeys); + const argv = args.slice(numKeys); + if (script.includes("ZREMRANGEBYSCORE") && script.includes("EXISTS")) { + const [writerKey, readersKey] = keys as [string, string]; + const [token, nowStr, expireAtStr] = argv as [string, string, string]; + this.reapZset(readersKey, Number(nowStr)); + if (this.strings.has(writerKey)) return 0; + this.getZset(readersKey).set(token, Number(expireAtStr)); + return 1; + } + if (script.includes("ZREM") && !script.includes("ZREMRANGEBYSCORE")) { + const [readersKey] = keys as [string]; + const [token] = argv as [string]; + const z = this.zsets.get(readersKey); + if (z?.has(token)) { + z.delete(token); + return 1; + } + return 0; + } + if (script.includes("ZSCORE")) { + const [readersKey] = keys as [string]; + const [token, expireAtStr] = argv as [string, string]; + const z = this.zsets.get(readersKey); + if (z?.has(token)) { + z.set(token, Number(expireAtStr)); + return 1; + } + return 0; + } + if (script.includes("SET") && script.includes("NX")) { + const [writerKey] = keys as [string]; + const [token, leaseMsStr] = argv as [string, string]; + if (this.strings.has(writerKey)) return null; + this.strings.set(writerKey, { value: token, expiresAt: Date.now() + Number(leaseMsStr) }); + return "OK"; + } + if (script.includes("ZCARD")) { + const [writerKey, readersKey] = keys as [string, string]; + const [token, nowStr] = argv as [string, string]; + const entry = this.strings.get(writerKey); + if (entry?.value !== token) return -1; + this.reapZset(readersKey, Number(nowStr)); + return this.zsets.get(readersKey)?.size ?? 0; + } + if (script.includes("PEXPIRE")) { + const [writerKey] = keys as [string]; + const [token, leaseMsStr] = argv as [string, string]; + const entry = this.strings.get(writerKey); + if (entry?.value === token) { + entry.expiresAt = Date.now() + Number(leaseMsStr); + return 1; + } + return 0; + } + if (script.includes("DEL")) { + const [writerKey] = keys as [string]; + const [token] = argv as [string]; + const entry = this.strings.get(writerKey); + if (entry?.value === token) { + this.strings.delete(writerKey); + return 1; + } + return 0; + } + throw new Error(`FakeRedis: unrecognised eval script: ${script.slice(0, 60)}`); + } +} + +function asRedis(f: FakeRedis): Redis { + return f as unknown as Redis; +} + +/** + * Coherent FS stub. `disconnect` records teardown; `reloadGone` makes the next + * reload() throw ESANDBOXGONE to simulate a zero-row loadAllPaths. + */ +class StubCoherentFs { + dirty = false; + reloadCount = 0; + disconnectCount = 0; + reloadGone = false; + + getAllPaths(): string[] { + return []; + } + async reload(): Promise { + this.reloadCount++; + if (this.reloadGone) throw createEsandboxgone("sbx"); + this.dirty = false; + } + wasDirty(): boolean { + return this.dirty; + } + clearDirty(): void { + this.dirty = false; + } + poisoned(): boolean { + return false; + } + async disconnect(): Promise { + this.disconnectCount++; + } +} + +function makeFsFactory(instance: StubCoherentFs): (_tenantId: string, _sandboxId: string) => Promise { + return vi.fn(async () => instance as unknown as IFileSystem); +} + +describe("SessionManager F7 — destroy reaches warm replicas", () => { + it("(a) reload's ESANDBOXGONE becomes a teardown + clean ENOENT", async () => { + const redis = new FakeRedis(); + const stub = new StubCoherentFs(); + const sm = new SessionManager({ createFs: makeFsFactory(stub), redis: asRedis(redis) }); + + // Warm the session at version 0 (a write happened on this replica earlier). + await sm.withSession("default", "sbx", async () => {}); + + // Another replica destroyed the sandbox: the row is gone AND the version + // key advanced (written variant). Bump the counter so ensureFreshCache + // mismatches and reloads — but loadAllPaths now returns zero rows. + redis.store.set("vfs:default:ver:sbx", { value: "9", expiresAt: Date.now() + 60_000 }); + stub.reloadGone = true; + + await expect(sm.withExistingSession("default", "sbx", async () => {})).rejects.toMatchObject({ code: "ENOENT" }); + + // Session torn down (dropped from map + PG pool disconnected). NOT serving + // an empty cache. + expect(sm.getSession("default", "sbx")).toBeUndefined(); + expect(stub.disconnectCount).toBe(1); + expect(stub.reloadCount).toBe(1); + }); + + it("(b) DESTROYED tombstone is recognised before the numeric parse → teardown + ENOENT", async () => { + const redis = new FakeRedis(); + const stub = new StubCoherentFs(); + const sm = new SessionManager({ createFs: makeFsFactory(stub), redis: asRedis(redis) }); + + await sm.withSession("default", "sbx", async () => { + stub.dirty = true; // version → 1 + }); + expect(sm.getSession("default", "sbx")?.lastSeenVersion).toBe(1); + + // Replica A destroys the sandbox → tombstone written. + redis.store.set("vfs:default:ver:sbx", { value: "DESTROYED", expiresAt: Date.now() + 60_000 }); + + await expect(sm.withExistingSession("default", "sbx", async () => {})).rejects.toMatchObject({ code: "ENOENT" }); + + expect(sm.getSession("default", "sbx")).toBeUndefined(); + expect(stub.disconnectCount).toBe(1); + // Tombstone caught before parse → no reload attempted. + expect(stub.reloadCount).toBe(0); + }); + + it("(c) never-written variant (lastSeenVersion 0, tombstone) still tears down — not the 0===0 silent ghost", async () => { + // Replica B warms a pure-READ session on a sandbox it never wrote to, so + // its lastSeenVersion stays 0. Replica A then destroys the sandbox → + // tombstone. With a bare DEL the version key would be ABSENT → read as 0 → + // 0 === 0 → NO reload would ever fire → B serves ghost reads forever. The + // sentinel breaks the tie even at lastSeenVersion 0. + const redis = new FakeRedis(); + const warm = new StubCoherentFs(); + const smB = new SessionManager({ createFs: makeFsFactory(warm), redis: asRedis(redis) }); + + await smB.withSession("default", "sbx", async () => { + /* pure read, no write */ + }); + const session = smB.getSession("default", "sbx"); + expect(session?.lastSeenVersion).toBe(0); + expect(redis.store.has("vfs:default:ver:sbx")).toBe(false); + + // Replica A's destroy writes the tombstone (separate manager, shared Redis). + const smA = new SessionManager({ + createFs: makeFsFactory(new StubCoherentFs()), + destroySandboxFn: vi.fn().mockResolvedValue(undefined), + redis: asRedis(redis), + }); + await smA.destroy("default", "sbx"); + expect(redis.store.get("vfs:default:ver:sbx")?.value).toBe("DESTROYED"); + + // B's next turn: the sentinel forces teardown despite lastSeenVersion 0. + await expect(smB.withExistingSession("default", "sbx", async () => {})).rejects.toMatchObject({ code: "ENOENT" }); + expect(smB.getSession("default", "sbx")).toBeUndefined(); + expect(warm.disconnectCount).toBe(1); + expect(warm.reloadCount).toBe(0); + }); + + it("re-creating a tombstoned sandbox clears the sentinel and starts at version 0", async () => { + const redis = new FakeRedis(); + // Tombstone left by a prior destroy. + redis.store.set("vfs:default:ver:sbx", { value: "DESTROYED", expiresAt: Date.now() + 60_000 }); + + const stub = new StubCoherentFs(); + const sm = new SessionManager({ createFs: makeFsFactory(stub), redis: asRedis(redis) }); + + // A fresh exec re-creates the sandbox (buildFs.createSandbox re-inserts the + // root). The session must start clean — sentinel cleared, version 0 — so a + // later INCR does not run against the non-numeric "DESTROYED". + await sm.withSession("default", "sbx", async () => {}); + const session = sm.getSession("default", "sbx"); + expect(session?.lastSeenVersion).toBe(0); + expect(redis.store.has("vfs:default:ver:sbx")).toBe(false); + + // A subsequent write publishes a real numeric version. + await sm.withSession("default", "sbx", async () => { + stub.dirty = true; + }); + expect(redis.store.get("vfs:default:ver:sbx")?.value).toBe("1"); + }); + + it("(e) tombstone is NOT written when destroySandboxFn throws (transient DB failure)", async () => { + const redis = new FakeRedis(); + const stub = new StubCoherentFs(); + const sm = new SessionManager({ + createFs: makeFsFactory(stub), + destroySandboxFn: vi.fn().mockRejectedValue(new Error("transient PG error")), + redis: asRedis(redis), + }); + + await sm.withSession("default", "sbx", async () => { + stub.dirty = true; + }); + expect(sm.getSession("default", "sbx")).toBeDefined(); + + await expect(sm.destroy("default", "sbx")).rejects.toThrow("transient PG error"); + + // The sandbox still exists in Postgres (destroy rolled back). The version + // key must NOT contain the tombstone — otherwise warm sessions on other + // replicas would tear down with false 404s. + const raw = await redis.get("vfs:default:ver:sbx"); + expect(raw).not.toBe("DESTROYED"); + + // FS must still be disconnected (pool leak prevention). + expect(stub.disconnectCount).toBe(1); + }); + + it("primary guard fires even without Redis: ESANDBOXGONE is not swallowed by ensureFreshCache no-op", async () => { + // With no Redis, ensureFreshCache is a no-op, so the gone-sandbox guard is + // not exercised through it. This documents that the Redis-independent + // teardown lives at the reload() boundary — sql-fs throws ESANDBOXGONE and + // the next layer that calls reload() (ensureFreshCache, only when Redis is + // present) maps it. Without Redis there is a single replica, so a destroyed + // warm session cannot exist on a peer. Guard: errors.ts emits the code. + expect((createEsandboxgone("sbx") as Error & { code?: string }).code).toBe("ESANDBOXGONE"); + }); +}); diff --git a/src/api/tests/unit/session-manager.version-counter.test.ts b/src/api/tests/unit/session-manager.version-counter.test.ts index 2ee014f..bb522bc 100644 --- a/src/api/tests/unit/session-manager.version-counter.test.ts +++ b/src/api/tests/unit/session-manager.version-counter.test.ts @@ -47,9 +47,12 @@ class FakeRedis { return z; } - async set(key: string, value: string, _px: "PX", ms: number, _nx: "NX"): Promise<"OK" | null> { + async set(key: string, value: string, unit: "PX" | "EX", amount: number, nx?: "NX"): Promise<"OK" | null> { this.gc(); - if (this.store.has(key)) return null; + const ms = unit === "EX" ? amount * 1000 : amount; + // SET ... NX (lock acquire) fails if the key already exists. + if (nx === "NX" && this.store.has(key)) return null; + // F7 tombstone uses unconditional SET ... EX (no NX) — always overwrites. this.store.set(key, { value, expiresAt: Date.now() + ms }); return "OK"; } @@ -319,7 +322,7 @@ describe("SessionManager version counter (Phase D)", () => { expect(stub.reloadCount).toBe(0); }); - it("destroy deletes the version key", async () => { + it("destroy tombstones the version key (F7)", async () => { const redis = new FakeRedis(); const stub = new StubCoherentFs(); const sm = new SessionManager({ @@ -331,13 +334,16 @@ describe("SessionManager version counter (Phase D)", () => { await sm.withSession("default", "sbx", async () => { stub.dirty = true; }); - expect(redis.store.has("vfs:default:ver:sbx")).toBe(true); + expect(redis.store.get("vfs:default:ver:sbx")?.value).toBe("1"); await sm.destroy("default", "sbx"); - expect(redis.store.has("vfs:default:ver:sbx")).toBe(false); + // F7: the key is NOT deleted — it carries the DESTROYED sentinel so a warm + // session on another replica recognises the destroy instead of reading an + // absent key as version 0. + expect(redis.store.get("vfs:default:ver:sbx")?.value).toBe("DESTROYED"); }); - it("destroy on an unknown sandbox still deletes stale version key", async () => { + it("destroy on an unknown sandbox still tombstones the stale version key (F7)", async () => { const redis = new FakeRedis(); // Stale key left behind by a previous incarnation redis.store.set("vfs:default:ver:sbx", { value: "42", expiresAt: Date.now() + 60_000 }); @@ -349,7 +355,7 @@ describe("SessionManager version counter (Phase D)", () => { }); await sm.destroy("default", "sbx"); - expect(redis.store.has("vfs:default:ver:sbx")).toBe(false); + expect(redis.store.get("vfs:default:ver:sbx")?.value).toBe("DESTROYED"); }); it("withExistingSession applies the same version check", async () => { diff --git a/src/sql-fs/errors.ts b/src/sql-fs/errors.ts index 7a613b1..99b7979 100644 --- a/src/sql-fs/errors.ts +++ b/src/sql-fs/errors.ts @@ -60,6 +60,21 @@ export function createEreadonly(path: string, op: string): Error { return makeFsError("EREADONLY", `EREADONLY: read-only filesystem, ${op} '${path}'`, path); } +/** + * ESANDBOXGONE: the sandbox (or its root inode) no longer exists in the DB. + * + * Raised by `SqlFs.#loadFreshPathCache` when `loadAllPaths` returns zero rows — + * the recursive CTE anchor joins `sandboxes` → root `inodes`, so an empty result + * means the sandbox/root was destroyed (F7). The caller (`ready`/`reload`) must + * NOT install an empty pathCache (which would serve ghost ENOENTs for every + * path); instead the session manager catches this, tears the warm session down, + * and surfaces a clean ENOENT → 404 to the client. Distinct from ENOENT so the + * teardown path is unambiguous and never confused with a single missing file. + */ +export function createEsandboxgone(sandboxId: string): Error { + return makeFsError("ESANDBOXGONE", `ESANDBOXGONE: sandbox no longer exists, '${sandboxId}'`); +} + // ── Sensitive-pattern stripping ─────────────────────────────────────────────── /** Patterns whose matches are replaced with [redacted] in sanitized error messages. */ diff --git a/src/sql-fs/redis-path-snapshot.ts b/src/sql-fs/redis-path-snapshot.ts index 3c55804..c515c08 100644 --- a/src/sql-fs/redis-path-snapshot.ts +++ b/src/sql-fs/redis-path-snapshot.ts @@ -18,6 +18,19 @@ export function versionKey(tenantId: string, sandboxId: string): string { return `vfs:${tenantId}:ver:${sandboxId}`; } +/** + * Tombstone sentinel written to the version key on `destroy` (F7) instead of + * `DEL`. A warm session on another replica that probes the counter sees this + * sentinel BEFORE the numeric parse and tears itself down, even in the + * never-written variant where its `lastSeenVersion` is still `0` and an absent + * (DEL'd) key would also read as `0` (`0 === 0` → no reload would ever fire). + * + * It is a DISTINCT string — NOT `-1`, which is the publish-failure marker + * (`publishVersionIfDirty`) — and NOT numeric, so `Number(raw) || 0` clamps it + * to `0` and must therefore be recognised explicitly at every clamp site. + */ +export const VERSION_TOMBSTONE = "DESTROYED"; + /** * Bump when the on-the-wire layout of `Snapshot` or `EncodedEntry` changes. * Old snapshots with a mismatched version are treated as a miss (Edge Case §8). diff --git a/src/sql-fs/sql-fs.ts b/src/sql-fs/sql-fs.ts index 20541ef..05f3955 100644 --- a/src/sql-fs/sql-fs.ts +++ b/src/sql-fs/sql-fs.ts @@ -23,9 +23,10 @@ import { createEnotempty, createEperm, createEreadonly, + createEsandboxgone, } from "./errors.js"; import type { RedisBlobCache } from "./redis-blob-cache.js"; -import { type RedisPathSnapshot, versionKey } from "./redis-path-snapshot.js"; +import { type RedisPathSnapshot, VERSION_TOMBSTONE, versionKey } from "./redis-path-snapshot.js"; import { type BulkIngestFile, INODE_KIND, type PathCacheEntry, type SqlDialect } from "./types.js"; /** @@ -469,7 +470,12 @@ export class SqlFs implements ICoherentFs, IReadOnlyScopeFs { if (this.#redis !== undefined && this.#pathSnapshot !== undefined) { try { const raw = await this.#redis.get(versionKey(this.#tenantId, this.#sandboxId)); - const currentVersion = raw === null ? 0 : Number(raw) || 0; + // F7: a tombstone must never be coerced to 0 by `Number(raw) || 0` + // and then falsely match a `version: 0` snapshot. Treat it as a hard + // mismatch (`-1`, an impossible snapshot version) so we fall through + // to `loadAllPaths`, which throws ESANDBOXGONE for the (now absent) + // sandbox. + const currentVersion = raw === VERSION_TOMBSTONE ? -1 : raw === null ? 0 : Number(raw) || 0; const snap = await this.#pathSnapshot.read(this.#tenantId, this.#sandboxId); if (snap === null) { missReason = "no_key"; @@ -632,6 +638,19 @@ export class SqlFs implements ICoherentFs, IReadOnlyScopeFs { const p = (async (): Promise => { try { const { entries } = await this.#loadFreshPathCache(); + // F7: a cross-replica reload that comes back EMPTY means the sandbox + // was destroyed on another replica — the recursive CTE anchor joins + // `sandboxes` → root `inode`, so a live sandbox always yields at least + // its root dir. Refuse to install an empty pathCache (which would + // serve ghost ENOENTs and break cwd resolution inside bash.exec). + // Throw ESANDBOXGONE WITHOUT clearing the existing caches so the + // session manager can tear the stale warm session down and surface a + // clean ENOENT → 404. (This guard lives in reload(), not ready(): + // ready() runs immediately after createSandbox, which guarantees a + // root row, whereas reload() refreshes a long-lived warm session.) + if (entries.size === 0) { + throw createEsandboxgone(this.#sandboxId); + } this.#cacheClear(); for (const [path, entry] of entries) this.#cacheSet(path, entry); this.#contentCache.clear(); diff --git a/src/sql-fs/tests/unit/sql-fs.reload-sandbox-gone.test.ts b/src/sql-fs/tests/unit/sql-fs.reload-sandbox-gone.test.ts new file mode 100644 index 0000000..66af063 --- /dev/null +++ b/src/sql-fs/tests/unit/sql-fs.reload-sandbox-gone.test.ts @@ -0,0 +1,101 @@ +/** + * F7 unit test: SqlFs.reload() must not install an empty pathCache when the + * sandbox has been destroyed on another replica. + * + * A live sandbox's recursive-CTE `loadAllPaths` always returns at least the + * root dir; zero rows means the sandbox/root is gone. reload() must throw + * ESANDBOXGONE (so the session manager tears the warm session down) and leave + * the prior caches intact rather than serving ghost ENOENTs. + */ + +import { describe, expect, it, vi } from "vitest"; +import { SqlFs } from "../../sql-fs.js"; +import type { PathCacheEntry, SqlDialect } from "../../types.js"; + +function makeDialect(): SqlDialect { + const transactionMock = vi.fn(async (fn: (tx: unknown) => Promise) => fn({})); + return { + connect: vi.fn(), + disconnect: vi.fn(), + transaction: transactionMock, + setSandboxContext: vi.fn(), + setSandboxContextWithLock: vi.fn(), + loadAllPaths: vi.fn(async () => []), + createSandbox: vi.fn(), + deleteSandbox: vi.fn(), + createInode: vi.fn(), + getInode: vi.fn(), + updateInode: vi.fn(), + deleteInode: vi.fn(), + incrementNlink: vi.fn(), + decrementNlink: vi.fn(), + insertDirent: vi.fn(), + upsertDirent: vi.fn(), + deleteDirent: vi.fn(), + listDirents: vi.fn(), + moveDirent: vi.fn(), + upsertBlob: vi.fn(), + getBlob: vi.fn(), + gcOrphanBlobs: vi.fn(), + getBlobsForSandbox: vi.fn(async () => []), + loadSubtreeInodes: vi.fn(), + bulkIngest: vi.fn(), + resolvePath: vi.fn(), + } as unknown as SqlDialect; +} + +const now = new Date("2026-01-01T00:00:00Z"); +const rootEntry: { path: string } & PathCacheEntry = { + path: "/", + inodeId: 1n, + kind: 2, + mode: 0o755, + size: 0, + mtime: now, + contentSha256: null, + symlinkTarget: null, +}; +const fileEntry: { path: string } & PathCacheEntry = { + path: "/file.txt", + inodeId: 10n, + kind: 1, + mode: 0o644, + size: 42, + mtime: now, + contentSha256: new Uint8Array(32), + symlinkTarget: null, +}; + +describe("SqlFs.reload — sandbox gone (F7)", () => { + it("throws ESANDBOXGONE when loadAllPaths returns zero rows on reload", async () => { + const dialect = makeDialect(); + const loadAllPaths = dialect.loadAllPaths as ReturnType; + // Initial ready() sees a live, populated sandbox. + loadAllPaths.mockResolvedValueOnce([rootEntry, fileEntry]); + const fs = new SqlFs({ dialect, sandboxId: "sbx" }); + await fs.ready(); + expect(await fs.exists("/file.txt")).toBe(true); + + // The sandbox is destroyed elsewhere → reload sees zero rows. + loadAllPaths.mockResolvedValueOnce([]); + await expect(fs.reload()).rejects.toMatchObject({ code: "ESANDBOXGONE" }); + + // The prior cache must be left intact — reload must NOT have installed an + // empty pathCache (which would serve ghost ENOENTs for everything). + expect(await fs.exists("/file.txt")).toBe(true); + }); + + it("reload succeeds normally when the sandbox still has rows", async () => { + const dialect = makeDialect(); + const loadAllPaths = dialect.loadAllPaths as ReturnType; + loadAllPaths.mockResolvedValueOnce([rootEntry, fileEntry]); + const fs = new SqlFs({ dialect, sandboxId: "sbx" }); + await fs.ready(); + + // Reload reflects a new file added by another replica. + const newEntry = { ...fileEntry, path: "/new.txt", inodeId: 11n }; + loadAllPaths.mockResolvedValueOnce([rootEntry, fileEntry, newEntry]); + await expect(fs.reload()).resolves.toBeUndefined(); + expect(await fs.exists("/new.txt")).toBe(true); + }); +}); diff --git a/thoughts/shared/plans/2026-06-13_f7-destroy-tombstone.md b/thoughts/shared/plans/2026-06-13_f7-destroy-tombstone.md new file mode 100644 index 0000000..b918f2d --- /dev/null +++ b/thoughts/shared/plans/2026-06-13_f7-destroy-tombstone.md @@ -0,0 +1,84 @@ +# F7 — Destroy must reach warm replicas: tombstone + sandbox-gone reload guard + +## Problem + +Destroying a sandbox on replica A does not reach warm sessions on other +replicas. Two failure modes: + +1. **Written session.** `deleteVersionKey` uses `redis.del`. Replica B's next + `ensureFreshCache` probe reads the (now absent) version counter as `0`, + mismatches its `lastSeenVersion`, and calls `reload()`. `reload()` runs + `loadAllPaths` against a deleted sandbox (0 rows), installs an EMPTY + pathCache, and serves ghost state — observably a non-zero exit + garbage + stderr inside an HTTP 200 exec (cwd resolution against an empty cache), + which is worse than a clean 404. +2. **Never-written session.** If B observed zero writes, `lastSeenVersion = 0`. + After destroy the absent counter also reads `0`, so `0 === 0` → **no reload + fires at all** → B serves its populated cache against a deleted sandbox. A + loadAllPaths-empty guard alone does NOT cover this; only a distinct sentinel + does (`0 !== sentinel`). + +## Fix to ship (layered) + +### Primary (Redis-independent) +`#loadFreshPathCache` detects a zero-row `loadAllPaths` (the postgres CTE anchor +returns 0 rows only when the sandbox/root is absent) and throws a typed +`ESANDBOXGONE` error instead of installing an empty cache. `reload()` propagates +it. `ensureFreshCache` catches `ESANDBOXGONE`, tears the warm session down +(mark closing, remove from the map, disconnect the PG pool), and rethrows as a +clean `ENOENT` → 404. + +### Secondary (tombstone) +`deleteVersionKey` writes a sentinel instead of deleting: +`redis.set(versionKey, 'DESTROYED', 'EX', VERSION_KEY_TTL_SECONDS)`. This covers +the never-written variant: `ensureFreshCache` recognizes the sentinel BEFORE the +numeric parse and tears down, even when `lastSeenVersion === 0`. The sentinel +string is `DESTROYED` — distinct from `-1` (the publish-failure marker). + +### Audit every `Number(raw) || 0` clamp site +- `session-manager.ts` `ensureFreshCache` (raw GETEX) — recognize sentinel before parse. +- `session-manager.ts` session creation `initialVersion` — recognize sentinel; a session created against a tombstoned sandbox must not start at `0`. +- `sql-fs.ts` `#loadFreshPathCache` snapshot version check — sentinel must not coerce to `0` and falsely match a `version:0` snapshot. + +## Discoveries + +- Line numbers in the issue predate batch-2. Located by symbol: + - `reload()` / `#loadFreshPathCache` → `src/sql-fs/sql-fs.ts` + - `loadAllPaths` CTE anchor → `src/sql-fs/dialects/postgres.ts:770` + - `deleteVersionKey` / `ensureFreshCache` / `initialVersion` → `src/api/session-manager.ts` +- `ENOENT` and `ESESSIONCLOSING` are already in `SAFE_FS_ERROR_CODES`; `ENOENT` → 404. We surface `ENOENT` (not a new code) so the route layer needs no change. +- Reaper teardown pattern (`session-manager.ts:1287`) is the template for `#tearDownGoneSession`: set `state="closing"`, `sessions.delete(key)`, `disconnectFs`. +- `mysql`/`azure-sql` dialects also implement `loadAllPaths`; the zero-row guard lives in `sql-fs.ts` (dialect-agnostic), so all dialects benefit. + +## Phases + +### Phase 1 — Primary: ESANDBOXGONE guard +- [ ] Add `createEsandboxgone(sandboxId)` to `src/sql-fs/errors.ts` (code `ESANDBOXGONE`). +- [ ] In `#loadFreshPathCache`, after the `loadAllPaths` query, if rows are empty throw `ESANDBOXGONE`. +- [ ] `reload()` lets the error propagate (no empty cache installed; old cache untouched by the load-before-clear ordering). +- [ ] `ensureFreshCache` catches `ESANDBOXGONE`, tears down the session, rethrows `ENOENT`. +- Automated: unit test (b)/(a) below. + +### Phase 2 — Secondary: tombstone sentinel +- [ ] `deleteVersionKey` → `redis.set(key, 'DESTROYED', 'EX', VERSION_KEY_TTL_SECONDS)`. +- [ ] Export `VERSION_TOMBSTONE` constant. +- [ ] `ensureFreshCache` recognizes the sentinel before the numeric parse → tear down + ENOENT. +- [ ] `initialVersion` (session creation) recognizes the sentinel. +- [ ] `#loadFreshPathCache` snapshot version check recognizes the sentinel (treat as mismatch). +- Automated: unit tests (b)/(c) below. + +### Phase 3 — Tests +- [ ] (a) reload with zero-row loadAllPaths throws ESANDBOXGONE (not empty cache); ensureFreshCache converts to teardown + ENOENT. +- [ ] (b) tombstone sentinel recognized → teardown. +- [ ] (c) never-written variant (initialVersion 0, post-destroy sentinel) still tears down. +- [ ] destroy writes the tombstone (not del); update the existing "destroy deletes the version key" test. + +## Success criteria + +### Automated +- `pnpm typecheck && pnpm lint:fix && pnpm test:unit` all pass. +- New unit tests above pass; existing tests not weakened. + +### Manual (LIVE) +- Create `vf7-*` sandbox → exec to populate → DELETE → re-exec/read → CLEAN 404/ENOENT (not 200 + garbage stderr). +- `redis-cli get vfs:...:ver:...` returns `DESTROYED`, not nil.