From 19a9568df066700ea72a3bcc06db1512a967fab4 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 12 Jun 2026 15:40:24 +0200 Subject: [PATCH 1/8] feat(orchestrator): allow skipping memory snapshot during pause Add Sandbox.Pause(WithFilesystemSnapshot()) to produce a snapshot that persists only the rootfs and skips the guest memory snapshot. The default remains a full memory snapshot; fs-only is strictly opt-in. When enabled, before pausing the VM a mandatory guest sync is run via envd (a failed sync fails the pause, since no memory snapshot preserves the page cache and the rootfs would otherwise be missing acknowledged writes), and memory prefetch is cleared. CreateSnapshot is still called for its disk drain+flush side effect, but the memfile diff is left empty (NoDiff) with a resolved-nil header. scheduling.FromHeaders now tolerates a nil memfile header, emitting rootfs-only scheduling metadata instead of dropping the metadata entirely. Snapshot.FilesystemSnapshot records the decision at pause time: it can't be inferred from the diff shape, since a memory snapshot with zero dirty pages also yields a NoDiff memfile but still needs its snapfile uploaded. Add -fs-only flag in cmd/resume-build to allow testing the operation locally. Snapshots taken with -fs-only should include only rootfs (meta)data. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Babis Chalios --- .../orchestrator/cmd/resume-build/main.go | 15 +- packages/orchestrator/pkg/sandbox/reclaim.go | 37 ++++ packages/orchestrator/pkg/sandbox/sandbox.go | 185 +++++++++++++----- packages/orchestrator/pkg/sandbox/snapshot.go | 7 + .../pkg/sandbox/template/storage_template.go | 20 +- .../orchestrator/pkg/scheduling/metadata.go | 35 ++-- .../pkg/scheduling/metadata_test.go | 80 ++++++++ 7 files changed, 310 insertions(+), 69 deletions(-) create mode 100644 packages/orchestrator/pkg/scheduling/metadata_test.go diff --git a/packages/orchestrator/cmd/resume-build/main.go b/packages/orchestrator/cmd/resume-build/main.go index ae82e9758a..71ecae4ee9 100644 --- a/packages/orchestrator/cmd/resume-build/main.go +++ b/packages/orchestrator/cmd/resume-build/main.go @@ -71,6 +71,7 @@ func main() { cmdPause := flag.String("cmd-pause", "", "execute command in sandbox, then pause on success") cmdSignalPause := flag.String("cmd-signal-pause", "", "execute command in sandbox, then wait for SIGUSR1 before pausing") optimize := flag.Bool("optimize", false, "collect fresh prefetch mapping after pause (resumes snapshot to record page faults)") + fsOnly := flag.Bool("fs-only", false, "pause without a memory snapshot (filesystem-only; resume reboots the guest)") shell := flag.Bool("shell", false, "attach an interactive PTY shell via envd (no sshd required in the sandbox)") fphTimeoutMs := flag.Int("fph-timeout-ms", 0, "override free-page-hinting-config pause timeout LD flag (0 = use LD default)") @@ -162,6 +163,12 @@ func main() { if *optimize && *iterations > 0 { log.Fatal("-optimize is incompatible with -iterations (benchmarking doesn't upload)") } + if *fsOnly && !isPauseMode { + log.Fatal("-fs-only requires a pause flag (-pause, -signal-pause, -cmd-pause, or -cmd-signal-pause)") + } + if *fsOnly && (*optimize || *fphBench) { + log.Fatal("-fs-only is incompatible with -optimize and -fph-bench (no memory snapshot)") + } if *shell && (isCmdMode || isPauseMode || *iterations > 0) { log.Fatal("-shell can only be used in interactive mode (no -cmd, no pause flags, no -iterations)") @@ -198,6 +205,7 @@ func main() { newBuildID: outputBuildID, iterations: *iterations, optimize: *optimize, + fsOnly: *fsOnly, } runOpts := runOptions{ @@ -233,6 +241,7 @@ type pauseOptions struct { newBuildID string iterations int // for benchmarking pause (only with immediate) optimize bool + fsOnly bool } func (p pauseOptions) enabled() bool { @@ -676,8 +685,12 @@ func (r *runner) pauseOnce(ctx context.Context, opts pauseOptions, verbose bool) } // Pause and create snapshot + var pauseSnapshotOpts []sandbox.PauseOption + if opts.fsOnly { + pauseSnapshotOpts = append(pauseSnapshotOpts, sandbox.WithFilesystemSnapshot()) + } pauseStart := time.Now() - snapshot, err := sbx.Pause(ctx, newMeta, sandbox.SnapshotUseCasePause) + snapshot, err := sbx.Pause(ctx, newMeta, sandbox.SnapshotUseCasePause, pauseSnapshotOpts...) pauseDur := time.Since(pauseStart) totalDur := time.Since(t0) diff --git a/packages/orchestrator/pkg/sandbox/reclaim.go b/packages/orchestrator/pkg/sandbox/reclaim.go index bc8801161f..8048b7047c 100644 --- a/packages/orchestrator/pkg/sandbox/reclaim.go +++ b/packages/orchestrator/pkg/sandbox/reclaim.go @@ -119,6 +119,43 @@ func (s *Sandbox) bestEffortReclaim(ctx context.Context) { } } +// guestSync runs sync in the guest via envd so ext4 flushes dirty pages to the +// virtio disk. Mandatory before a filesystem-only pause: without a memory +// snapshot the guest page cache is lost, so callers must fail the pause on +// error instead of persisting a rootfs missing acknowledged writes. Unlike +// bestEffortReclaim's sync step (LD-flag gated, best-effort), this always runs +// and always reports failure. +func (s *Sandbox) guestSync(ctx context.Context) error { + const syncTimeout = 5 * time.Second + + ctx, span := tracer.Start(ctx, "envd-guest-sync") + defer span.End() + + rcCtx, cancel := context.WithTimeout(ctx, syncTimeout+reclaimOuterSlack) + defer cancel() + + stream, err := s.StartEnvdSystemShell(rcCtx, "/bin/sh", []string{"-c", "sync"}, "root", syncTimeout) + if err != nil { + return fmt.Errorf("start guest sync: %w", err) + } + defer stream.Close() + + exitCode := int32(-1) + for stream.Receive() { + if end := stream.Msg().GetEvent().GetEnd(); end != nil { + exitCode = end.GetExitCode() + } + } + if err := stream.Err(); err != nil { + return fmt.Errorf("guest sync stream: %w", err) + } + if exitCode != 0 { + return fmt.Errorf("guest sync exited with code %d", exitCode) + } + + return nil +} + // envdSupportsCgroupFreeze reports whether the sandbox's envd exposes the // native /freeze and /unfreeze endpoints. Bad version strings log and return // false so we never accidentally call an unsupported endpoint. diff --git a/packages/orchestrator/pkg/sandbox/sandbox.go b/packages/orchestrator/pkg/sandbox/sandbox.go index 860fed4ef7..f541fc4002 100644 --- a/packages/orchestrator/pkg/sandbox/sandbox.go +++ b/packages/orchestrator/pkg/sandbox/sandbox.go @@ -1089,6 +1089,20 @@ func (s *Sandbox) Shutdown(ctx context.Context) error { return nil } +type pauseOptions struct { + filesystemSnapshot bool +} + +type PauseOption func(*pauseOptions) + +// WithFilesystemSnapshot makes the pause produce a filesystem-only snapshot: +// guest memory is not snapshotted, only the filesystem (rootfs) is persisted. +// Resuming such a snapshot reboots the guest instead of restoring memory state. +// The default (no option) is a full memory snapshot. +func WithFilesystemSnapshot() PauseOption { + return func(o *pauseOptions) { o.filesystemSnapshot = true } +} + // Pause creates a snapshot of the sandbox. // // Currently the memory snapshotting works like this: @@ -1100,12 +1114,25 @@ func (s *Sandbox) Shutdown(ctx context.Context) error { // that returns info about resident memory pages and about empty memory pages. // 5. Base on the info from the custom FC endpoint or from Uffd we copy the pages directly from the FC process to a local cache. // 6. We then can either close the sandbox or resume it. +// +// With WithFilesystemSnapshot(), steps 3-5 are skipped: a guest sync flushes +// the page cache to disk before pause, CreateSnapshot is still called for its +// disk drain+flush side effect (the snapfile is not uploaded), and the memfile +// diff is empty (NoDiff). func (s *Sandbox) Pause( ctx context.Context, m metadata.Template, useCase SnapshotUseCase, + opts ...PauseOption, ) (st *Snapshot, e error) { - ctx, span := tracer.Start(ctx, "sandbox-snapshot") + var pauseOpts pauseOptions + for _, opt := range opts { + opt(&pauseOpts) + } + + ctx, span := tracer.Start(ctx, "sandbox-snapshot", trace.WithAttributes( + attribute.Bool("fs-only-snapshot", pauseOpts.filesystemSnapshot), + )) defer span.End() cleanup := NewCleanup() @@ -1145,6 +1172,19 @@ func (s *Sandbox) Pause( return nil }) + if pauseOpts.filesystemSnapshot { + // FC never flushes the guest page cache and no memory snapshot will + // preserve it, so a failed sync would persist a rootfs missing + // acknowledged writes. Mandatory, unlike the best-effort reclaim above. + // The unfreeze cleanup is already registered, so failing here can't + // leave the live VM frozen. + if err := s.guestSync(ctx); err != nil { + return nil, fmt.Errorf("guest sync before filesystem-only pause: %w", err) + } + // Memory prefetch refers to the memfile, which is not persisted. + m.Prefetch = nil + } + // Drain free-page-hinting before pause so the snapshot doesn't capture // pages the guest already considers free. Timeout per use case; 0 disables. if t := featureflags.GetFreePageHintingTimeout(ctx, s.featureFlags, string(useCase), sandboxLDContext(s.Runtime, s.Config)); t > 0 { @@ -1173,57 +1213,29 @@ func (s *Sandbox) Pause( } // Gather data for postprocessing - originalMemfile, err := s.Template.Memfile(ctx) - if err != nil { - return nil, fmt.Errorf("failed to get original memfile: %w", err) - } - originalRootfs, err := s.Template.Rootfs() if err != nil { return nil, fmt.Errorf("failed to get original rootfs: %w", err) } - memfileDiffMetadata, err := s.Resources.memory.DiffMetadata(ctx, s.process) - if err != nil { - return nil, fmt.Errorf("failed to get memfile metadata: %w", err) - } - recordSnapshotDiff(ctx, "memfile", memfileDiffMetadata, originalMemfile.Header()) - // Start POSTPROCESSING - var dedupBase block.ReadonlyDevice - var dedupBestEffort, dedupDirectIO bool - var dedupBudget block.DedupBudget - dedupCfg := s.featureFlags.JSONFlag(ctx, featureflags.MemfileDiffDedupFlag, sandboxLDContext(s.Runtime, s.Config)).AsValueMap() - if dedupCfg.Get("enabled").BoolValue() { - dedupBase = originalMemfile - dedupBestEffort = dedupCfg.Get("bestEffort").BoolValue() - dedupDirectIO = dedupCfg.Get("directIO").BoolValue() - dedupBudget = block.DedupBudget{ - MaxFetchWindowsPerBlock: dedupCfg.Get("maxFetchWindowsPerBlock").IntValue(), - MaxPromotedParentPagesPerBlock: dedupCfg.Get("maxPromotedParentPagesPerBlock").IntValue(), - MaxPagesPerPromotedFrame: dedupCfg.Get("maxPagesPerPromotedFrame").IntValue(), - BlockFaultPct: dedupCfg.Get("blockFaultPct").IntValue(), - FetchRunWindowPages: dedupCfg.Get("fetchRunWindowPages").IntValue(), + // + // For a filesystem-only pause the memory snapshot is skipped entirely: the + // memfile diff stays NoDiff with no header, and the memfile-derived fields + // stay zero so the snapshot and scheduling metadata carry rootfs only. + mem := memorySnapshot{ + diff: build.Diff(&build.NoDiff{}), + diffHeader: NewResolvedDiffHeader(nil), + } + if !pauseOpts.filesystemSnapshot { + mem, err = s.processMemorySnapshot(ctx, buildID) + if err != nil { + return nil, err } } - memfileDiff, memfileDiffHeader, err := pauseProcessMemory( - ctx, - buildID, - originalMemfile.Header(), - memfileDiffMetadata, - s.config.DefaultCacheDir, - s.process, - s.memory.Memfd(ctx), - s.featureFlags.BoolFlag(ctx, featureflags.MemfdBackgroundCopyFlag, sandboxLDContext(s.Runtime, s.Config)), - dedupBase, - dedupBestEffort, - dedupDirectIO, - dedupBudget, - ) - if err != nil { - return nil, fmt.Errorf("error while post processing: %w", err) - } - cleanup.AddNoContext(ctx, memfileDiff.Close) + // NoDiff.Close is a no-op, so registering it for the filesystem-only case is + // harmless and keeps the cleanup ordering identical to the memory path. + cleanup.AddNoContext(ctx, mem.diff.Close) rootfsDiff, rootfsHeader, err := pauseProcessRootfs( ctx, @@ -1249,8 +1261,8 @@ func (s *Sandbox) Pause( // base-identical ones, so it over-estimates. The rootfs copy is synchronous // today, so its new header carries the exact rootfs chain and bytes; if it // ever becomes async, switch it to the parent plus a dirty proxy like memfile. - newMemfileBytes := memfileDiffMetadata.Dirty.GetCardinality() * uint64(memfileDiffMetadata.BlockSize) - schedulingMetadata := scheduling.FromHeaders(buildID, originalMemfile.Header(), rootfsHeader, newMemfileBytes) + // mem.header is nil for a filesystem-only pause → rootfs-only metadata. + schedulingMetadata := scheduling.FromHeaders(buildID, mem.header, rootfsHeader, mem.newBytes) metadataFileLink := template.NewLocalFileLink(cachePaths.CacheMetadata()) cleanup.AddNoContext(ctx, metadataFileLink.Close) @@ -1263,12 +1275,13 @@ func (s *Sandbox) Pause( return &Snapshot{ Snapfile: snapfile, Metafile: metadataFileLink, - MemfileDiff: memfileDiff, - MemfileDiffHeader: memfileDiffHeader, + MemfileDiff: mem.diff, + MemfileDiffHeader: mem.diffHeader, RootfsDiff: rootfsDiff, RootfsDiffHeader: rootfsDiffHeader, SchedulingMetadata: schedulingMetadata, - MemfileBlockSize: originalMemfile.Header().Metadata.BlockSize, + FilesystemSnapshot: pauseOpts.filesystemSnapshot, + MemfileBlockSize: mem.blockSize, RootfsBlockSize: originalRootfs.Header().Metadata.BlockSize, BuildID: buildID, @@ -1277,6 +1290,80 @@ func (s *Sandbox) Pause( }, nil } +// memorySnapshot holds the products of memory postprocessing during a +// Pause. For a filesystem-only pause it is left zero-valued except for an empty +// NoDiff and its resolved-nil header. +type memorySnapshot struct { + diff build.Diff + diffHeader *DiffHeader + header *header.Header + blockSize uint64 + // newBytes is the pre-dedup dirty-byte upper bound passed to scheduling + // metadata for the new layer. + newBytes uint64 +} + +// processMemorySnapshot copies the dirty guest memory pages into a local diff +// and builds its header — steps 3-5 of Pause. Only called for a full memory +// snapshot; a filesystem-only pause skips it. The returned diff's Close must be +// registered for cleanup by the caller. +func (s *Sandbox) processMemorySnapshot(ctx context.Context, buildID uuid.UUID) (memorySnapshot, error) { + originalMemfile, err := s.Template.Memfile(ctx) + if err != nil { + return memorySnapshot{}, fmt.Errorf("failed to get original memfile: %w", err) + } + memfileHeader := originalMemfile.Header() + + memfileDiffMetadata, err := s.Resources.memory.DiffMetadata(ctx, s.process) + if err != nil { + return memorySnapshot{}, fmt.Errorf("failed to get memfile metadata: %w", err) + } + recordSnapshotDiff(ctx, "memfile", memfileDiffMetadata, memfileHeader) + + var dedupBase block.ReadonlyDevice + var dedupBestEffort, dedupDirectIO bool + var dedupBudget block.DedupBudget + dedupCfg := s.featureFlags.JSONFlag(ctx, featureflags.MemfileDiffDedupFlag, sandboxLDContext(s.Runtime, s.Config)).AsValueMap() + if dedupCfg.Get("enabled").BoolValue() { + dedupBase = originalMemfile + dedupBestEffort = dedupCfg.Get("bestEffort").BoolValue() + dedupDirectIO = dedupCfg.Get("directIO").BoolValue() + dedupBudget = block.DedupBudget{ + MaxFetchWindowsPerBlock: dedupCfg.Get("maxFetchWindowsPerBlock").IntValue(), + MaxPromotedParentPagesPerBlock: dedupCfg.Get("maxPromotedParentPagesPerBlock").IntValue(), + MaxPagesPerPromotedFrame: dedupCfg.Get("maxPagesPerPromotedFrame").IntValue(), + BlockFaultPct: dedupCfg.Get("blockFaultPct").IntValue(), + FetchRunWindowPages: dedupCfg.Get("fetchRunWindowPages").IntValue(), + } + } + + memfileDiff, memfileDiffHeader, err := pauseProcessMemory( + ctx, + buildID, + memfileHeader, + memfileDiffMetadata, + s.config.DefaultCacheDir, + s.process, + s.memory.Memfd(ctx), + s.featureFlags.BoolFlag(ctx, featureflags.MemfdBackgroundCopyFlag, sandboxLDContext(s.Runtime, s.Config)), + dedupBase, + dedupBestEffort, + dedupDirectIO, + dedupBudget, + ) + if err != nil { + return memorySnapshot{}, fmt.Errorf("error while post processing: %w", err) + } + + return memorySnapshot{ + diff: memfileDiff, + diffHeader: memfileDiffHeader, + header: memfileHeader, + blockSize: memfileHeader.Metadata.BlockSize, + newBytes: memfileDiffMetadata.Dirty.GetCardinality() * uint64(memfileDiffMetadata.BlockSize), + }, nil +} + // FlushAndReadBalloonMetrics triggers an FC metrics flush and returns the // updated cumulative virtio-balloon counters. Used by the FPH bench. func (s *Sandbox) FlushAndReadBalloonMetrics(ctx context.Context) (fc.BalloonMetricsSnapshot, error) { diff --git a/packages/orchestrator/pkg/sandbox/snapshot.go b/packages/orchestrator/pkg/sandbox/snapshot.go index 5c4ef05729..4cd81e7bff 100644 --- a/packages/orchestrator/pkg/sandbox/snapshot.go +++ b/packages/orchestrator/pkg/sandbox/snapshot.go @@ -36,6 +36,13 @@ type Snapshot struct { BuildID uuid.UUID SchedulingMetadata *orchestrator.SchedulingMetadata + // FilesystemSnapshot is true for filesystem-only snapshots: the memfile diff + // is empty (NoDiff) and the memfile, memfile header, and snapfile are not + // uploaded. It records the decision made at pause time, which can't be + // inferred from the diff shape — a memory snapshot with zero dirty pages also + // produces a NoDiff memfile but still needs its snapfile uploaded. + FilesystemSnapshot bool + // Template block sizes captured sync at Pause time. They equal // MemfileDiffHeader.Metadata.BlockSize once that header resolves, but // are needed sync by NewUpload's compression validation — the dedup diff --git a/packages/orchestrator/pkg/sandbox/template/storage_template.go b/packages/orchestrator/pkg/sandbox/template/storage_template.go index 955578cc11..7601b96198 100644 --- a/packages/orchestrator/pkg/sandbox/template/storage_template.go +++ b/packages/orchestrator/pkg/sandbox/template/storage_template.go @@ -276,18 +276,28 @@ func (t *storageTemplate) Files() storage.CachePaths { // rather than the header holders, which stay unset for templates loaded from // storage (the headers are resolved internally by NewStorage during Fetch). func (t *storageTemplate) SchedulingMetadata(ctx context.Context) *orchestrator.SchedulingMetadata { - memfile, memfileErr := t.memfile.WaitWithContext(ctx) + // The rootfs is always present; its header carries the build ID and is the + // minimum needed for scheduling metadata. rootfs, rootfsErr := t.rootfs.WaitWithContext(ctx) - if memfileErr != nil || rootfsErr != nil { + if rootfsErr != nil { return nil } - mh := memfile.Header() - if mh == nil || mh.Metadata == nil { + rh := rootfs.Header() + if rh == nil || rh.Metadata == nil { return nil } - return scheduling.FromHeaders(mh.Metadata.BuildId, mh, rootfs.Header(), 0) + // Filesystem-only snapshots have no memfile object, so memfile.WaitWithContext + // errors on reload. Tolerate that and report rootfs-only scheduling metadata + // (FromHeaders treats a nil memfile header as rootfs-only) instead of + // dropping the rootfs affinity data too. + var mh *header.Header + if memfile, memfileErr := t.memfile.WaitWithContext(ctx); memfileErr == nil { + mh = memfile.Header() + } + + return scheduling.FromHeaders(rh.Metadata.BuildId, mh, rh, 0) } func (t *storageTemplate) Memfile(ctx context.Context) (block.ReadonlyDevice, error) { diff --git a/packages/orchestrator/pkg/scheduling/metadata.go b/packages/orchestrator/pkg/scheduling/metadata.go index de6111ef4f..94ea7cad25 100644 --- a/packages/orchestrator/pkg/scheduling/metadata.go +++ b/packages/orchestrator/pkg/scheduling/metadata.go @@ -20,28 +20,35 @@ const chainLimit = 128 // derives the rest from the resolved parent headers and passes newMemfileBytes // (a pre-dedup upper bound) for the new layer. Lists are sorted by build ID — // order is not significant for affinity matching. +// +// A nil memfileHeader (filesystem-only snapshot) yields rootfs-only metadata +// with empty memfile fields; the rootfs header is always required. func FromHeaders(buildID uuid.UUID, memfileHeader, rootfsHeader *header.Header, newMemfileBytes uint64) *orchestrator.SchedulingMetadata { - if memfileHeader == nil || memfileHeader.Metadata == nil || rootfsHeader == nil || rootfsHeader.Metadata == nil { + if rootfsHeader == nil || rootfsHeader.Metadata == nil { return nil } - memfileBase := memfileHeader.Metadata.BaseBuildId rootfsBase := rootfsHeader.Metadata.BaseBuildId - - memIDs, memBytes, memDropped := artifactBuilds(memfileHeader, memfileBase, buildID, newMemfileBytes) rootIDs, rootBytes, rootDropped := artifactBuilds(rootfsHeader, rootfsBase, buildID, 0) - return &orchestrator.SchedulingMetadata{ - MemfileBaseBuildId: memfileBase.String(), - RootfsBaseBuildId: rootfsBase.String(), - BuildId: buildID.String(), - MemfileBuildIds: memIDs, - RootfsBuildIds: rootIDs, - MemfileBuildBytes: memBytes, - RootfsBuildBytes: rootBytes, - MemfileDroppedBuilds: uint32(memDropped), - RootfsDroppedBuilds: uint32(rootDropped), + md := &orchestrator.SchedulingMetadata{ + RootfsBaseBuildId: rootfsBase.String(), + BuildId: buildID.String(), + RootfsBuildIds: rootIDs, + RootfsBuildBytes: rootBytes, + RootfsDroppedBuilds: uint32(rootDropped), + } + + if memfileHeader != nil && memfileHeader.Metadata != nil { + memfileBase := memfileHeader.Metadata.BaseBuildId + memIDs, memBytes, memDropped := artifactBuilds(memfileHeader, memfileBase, buildID, newMemfileBytes) + md.MemfileBaseBuildId = memfileBase.String() + md.MemfileBuildIds = memIDs + md.MemfileBuildBytes = memBytes + md.MemfileDroppedBuilds = uint32(memDropped) } + + return md } // artifactBuilds returns the build IDs referenced by the header and their diff --git a/packages/orchestrator/pkg/scheduling/metadata_test.go b/packages/orchestrator/pkg/scheduling/metadata_test.go new file mode 100644 index 0000000000..bb0e3b2fe6 --- /dev/null +++ b/packages/orchestrator/pkg/scheduling/metadata_test.go @@ -0,0 +1,80 @@ +package scheduling_test + +import ( + "testing" + + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/e2b-dev/infra/packages/orchestrator/pkg/scheduling" + "github.com/e2b-dev/infra/packages/shared/pkg/storage/header" +) + +func hdr(buildID, baseID uuid.UUID) *header.Header { + return &header.Header{ + Metadata: &header.Metadata{ + BuildId: buildID, + BaseBuildId: baseID, + BlockSize: 4096, + }, + } +} + +func TestFromHeaders(t *testing.T) { + t.Parallel() + + buildID := uuid.MustParse("11111111-1111-1111-1111-111111111111") + memBase := uuid.MustParse("22222222-2222-2222-2222-222222222222") + rootBase := uuid.MustParse("33333333-3333-3333-3333-333333333333") + + memHeader := hdr(buildID, memBase) + rootHeader := hdr(buildID, rootBase) + + t.Run("filesystem-only: nil memfile header yields rootfs-only metadata", func(t *testing.T) { + t.Parallel() + md := scheduling.FromHeaders(buildID, nil, rootHeader, 0) + + require.NotNil(t, md) + assert.Equal(t, buildID.String(), md.GetBuildId()) + assert.Equal(t, rootBase.String(), md.GetRootfsBaseBuildId()) + assert.NotEmpty(t, md.GetRootfsBuildIds()) + + // Memfile fields must be empty for a filesystem-only snapshot. + assert.Empty(t, md.GetMemfileBaseBuildId()) + assert.Empty(t, md.GetMemfileBuildIds()) + assert.Empty(t, md.GetMemfileBuildBytes()) + assert.Zero(t, md.GetMemfileDroppedBuilds()) + }) + + t.Run("memfile header with nil Metadata is treated as absent", func(t *testing.T) { + t.Parallel() + md := scheduling.FromHeaders(buildID, &header.Header{}, rootHeader, 0) + + require.NotNil(t, md) + assert.NotEmpty(t, md.GetRootfsBuildIds()) + assert.Empty(t, md.GetMemfileBaseBuildId()) + assert.Empty(t, md.GetMemfileBuildIds()) + }) + + t.Run("nil rootfs header yields nil", func(t *testing.T) { + t.Parallel() + assert.Nil(t, scheduling.FromHeaders(buildID, memHeader, nil, 0)) + }) + + t.Run("rootfs header with nil Metadata yields nil", func(t *testing.T) { + t.Parallel() + assert.Nil(t, scheduling.FromHeaders(buildID, memHeader, &header.Header{}, 0)) + }) + + t.Run("both headers present populate memfile and rootfs", func(t *testing.T) { + t.Parallel() + md := scheduling.FromHeaders(buildID, memHeader, rootHeader, 1024) + + require.NotNil(t, md) + assert.Equal(t, memBase.String(), md.GetMemfileBaseBuildId()) + assert.Equal(t, rootBase.String(), md.GetRootfsBaseBuildId()) + assert.NotEmpty(t, md.GetMemfileBuildIds()) + assert.NotEmpty(t, md.GetRootfsBuildIds()) + }) +} From 3f0fb5f2da7bceb2ec1d07d4312aa8739108e0d8 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 15 Jun 2026 12:23:22 +0200 Subject: [PATCH 2/8] fix(orchestrator): scale pre-pause guest-sync timeout with RAM + override flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mandatory pre-pause guest sync used a fixed 5s deadline, which is too short under IO contention — a failed sync fails the (filesystem-only) pause. The dirty page cache a sync must flush is bounded by guest RAM (pages already written back are not re-flushed), so scale the deadline by RamMB against a pessimistic 50 MiB/s flush-throughput floor, clamped to [5s, 2min]. Add the guest-sync-timeout-milliseconds feature flag as an operational override: a positive value pins the deadline regardless of RAM; 0 (default) keeps the RAM-derived value. Unit tests cover the scaling/clamping and the flag override/fallback. Addresses PR review feedback. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Babis Chalios --- packages/orchestrator/pkg/sandbox/reclaim.go | 53 +++++++++++++- .../orchestrator/pkg/sandbox/reclaim_test.go | 72 +++++++++++++++++++ packages/shared/pkg/featureflags/flags.go | 4 ++ 3 files changed, 128 insertions(+), 1 deletion(-) create mode 100644 packages/orchestrator/pkg/sandbox/reclaim_test.go diff --git a/packages/orchestrator/pkg/sandbox/reclaim.go b/packages/orchestrator/pkg/sandbox/reclaim.go index 8048b7047c..38570621b1 100644 --- a/packages/orchestrator/pkg/sandbox/reclaim.go +++ b/packages/orchestrator/pkg/sandbox/reclaim.go @@ -26,6 +26,24 @@ const reclaimOuterSlack = 500 * time.Millisecond // must stay independent of the reclaim shell deadline. const freezeTimeout = 2 * time.Second +const ( + // syncMinTimeout floors the guest-sync deadline; it covers small-RAM + // sandboxes and the shell round-trip. + syncMinTimeout = 5 * time.Second + + // syncMaxTimeout caps the guest-sync deadline so a stuck sync still fails the + // pause in bounded time rather than hanging it. + syncMaxTimeout = 2 * time.Minute + + // syncFlushFloorBytesPerSec is a pessimistic floor for guest page-cache + // flush throughput to the virtio disk under IO contention. The data a sync + // must flush is bounded by the dirty page cache (≈ guest RAM; pages already + // written back are not re-flushed), so the deadline scales with RAM against + // this floor. Conservative on purpose: too low only over-waits, while too + // high would falsely fail the (mandatory) pre-pause sync. + syncFlushFloorBytesPerSec = 50 * 1024 * 1024 +) + // buildReclaimScript builds the fstrim/sync/drop_caches/compact_memory chain. // Returns ("", 0) when every step is disabled. func (s *Sandbox) buildReclaimScript(cfg featureflags.ReclaimConfig) (string, time.Duration) { @@ -119,6 +137,39 @@ func (s *Sandbox) bestEffortReclaim(ctx context.Context) { } } +// ramScaledSyncTimeout derives the guest-sync deadline from guest RAM. The +// dirty page cache that sync must flush is bounded by RAM, divided by a +// pessimistic flush-throughput floor, then clamped to +// [syncMinTimeout, syncMaxTimeout]. +func ramScaledSyncTimeout(ramMB int64) time.Duration { + ramBytes := ramMB * 1024 * 1024 + d := time.Duration(ramBytes/syncFlushFloorBytesPerSec) * time.Second + + if d < syncMinTimeout { + return syncMinTimeout + } + if d > syncMaxTimeout { + return syncMaxTimeout + } + + return d +} + +// guestSyncTimeout returns the deadline for the pre-pause guest sync. The +// GuestSyncTimeoutMs feature flag pins it (milliseconds) when set to a positive +// value; otherwise it scales with guest RAM via ramScaledSyncTimeout. +func (s *Sandbox) guestSyncTimeout(ctx context.Context) time.Duration { + if ms := s.featureFlags.IntFlag(ctx, featureflags.GuestSyncTimeoutMs, + featureflags.SandboxContext(s.Runtime.SandboxID), + featureflags.TeamContext(s.Runtime.TeamID), + featureflags.TemplateContext(s.Runtime.TemplateID), + ); ms > 0 { + return time.Duration(ms) * time.Millisecond + } + + return ramScaledSyncTimeout(s.Config.RamMB) +} + // guestSync runs sync in the guest via envd so ext4 flushes dirty pages to the // virtio disk. Mandatory before a filesystem-only pause: without a memory // snapshot the guest page cache is lost, so callers must fail the pause on @@ -126,7 +177,7 @@ func (s *Sandbox) bestEffortReclaim(ctx context.Context) { // bestEffortReclaim's sync step (LD-flag gated, best-effort), this always runs // and always reports failure. func (s *Sandbox) guestSync(ctx context.Context) error { - const syncTimeout = 5 * time.Second + syncTimeout := s.guestSyncTimeout(ctx) ctx, span := tracer.Start(ctx, "envd-guest-sync") defer span.End() diff --git a/packages/orchestrator/pkg/sandbox/reclaim_test.go b/packages/orchestrator/pkg/sandbox/reclaim_test.go new file mode 100644 index 0000000000..d5325b5f1f --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/reclaim_test.go @@ -0,0 +1,72 @@ +//go:build linux + +package sandbox + +import ( + "testing" + "time" + + "github.com/launchdarkly/go-sdk-common/v3/ldvalue" + "github.com/launchdarkly/go-server-sdk/v7/testhelpers/ldtestdata" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/e2b-dev/infra/packages/shared/pkg/featureflags" +) + +func TestRamScaledSyncTimeout(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + ramMB int64 + want time.Duration + }{ + // 128 MiB / 50 MiB/s ≈ 2.5s -> clamped up to the floor. + {"small RAM clamps to min", 128, syncMinTimeout}, + // 1024 MiB / 50 MiB/s ≈ 20s. + {"scales with RAM", 1024, 20 * time.Second}, + // 128 GiB / 50 MiB/s ≈ 2621s -> clamped down to the cap. + {"large RAM clamps to max", 128 * 1024, syncMaxTimeout}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.want, ramScaledSyncTimeout(tt.ramMB)) + }) + } +} + +func newSandboxWithFF(t *testing.T, ramMB int64, td *ldtestdata.TestDataSource) *Sandbox { + t.Helper() + + ff, err := featureflags.NewClientWithDatasource(td) + require.NoError(t, err) + t.Cleanup(func() { _ = ff.Close(t.Context()) }) + + s := &Sandbox{Metadata: &Metadata{Config: &Config{RamMB: ramMB}}} + s.featureFlags = ff + + return s +} + +func TestGuestSyncTimeout_FlagOverride(t *testing.T) { + t.Parallel() + + t.Run("positive flag pins the timeout regardless of RAM", func(t *testing.T) { + t.Parallel() + td := ldtestdata.DataSource() + td.Update(td.Flag(featureflags.GuestSyncTimeoutMs.Key()).ValueForAll(ldvalue.Int(30000))) + s := newSandboxWithFF(t, 1024, td) // RAM-derived would be 20s + + assert.Equal(t, 30*time.Second, s.guestSyncTimeout(t.Context())) + }) + + t.Run("unset flag falls back to RAM-derived", func(t *testing.T) { + t.Parallel() + s := newSandboxWithFF(t, 1024, ldtestdata.DataSource()) + + assert.Equal(t, ramScaledSyncTimeout(1024), s.guestSyncTimeout(t.Context())) + }) +} diff --git a/packages/shared/pkg/featureflags/flags.go b/packages/shared/pkg/featureflags/flags.go index 02b4ed9feb..6f928687d2 100644 --- a/packages/shared/pkg/featureflags/flags.go +++ b/packages/shared/pkg/featureflags/flags.go @@ -232,6 +232,10 @@ var ( BestOfKAlpha = NewIntFlag("best-of-k-alpha", 50) // Default Alpha=0.5 (stored as percentage for int flag, current usage weight) EnvdInitTimeoutMilliseconds = NewIntFlag("envd-init-request-timeout-milliseconds", 50) // Timeout for envd init request in milliseconds HostStatsSamplingInterval = NewIntFlag("host-stats-sampling-interval", 5000) // Host stats sampling interval in milliseconds (default 5s) + // GuestSyncTimeoutMs overrides the mandatory pre-pause guest-sync deadline + // for filesystem-only snapshots, in milliseconds. 0 (default) derives the + // timeout from guest RAM; a positive value pins it. + GuestSyncTimeoutMs = NewIntFlag("guest-sync-timeout-milliseconds", 0) MaxCacheWriterConcurrencyFlag = NewIntFlag("max-cache-writer-concurrency", 10) // BuildCacheMaxUsagePercentage the maximum percentage of the cache disk storage From 31c13b729b352110dabdbf00f20a2630cf627426 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 12 Jun 2026 15:42:40 +0200 Subject: [PATCH 3/8] refactor(orchestrator): bundle memfile snapshot fields into MemorySnapshot Export the memorySnapshot helper as MemorySnapshot and embed it in Snapshot as a single field, replacing the flat MemfileDiff / MemfileDiffHeader / MemfileBlockSize fields. The Diff, DiffHeader, and BlockSize fields are exported (read cross-package by the upload, layer, and server paths); the header and newBytes scheduling inputs stay unexported as they are used only at Pause time. Pure rename/restructure: no behavioral change. Consumers updated to read snap.MemorySnapshot.Diff / .DiffHeader / .BlockSize. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Babis Chalios --- .../orchestrator/pkg/sandbox/build_upload.go | 2 +- .../pkg/sandbox/build_upload_v3.go | 6 +-- .../pkg/sandbox/build_upload_v4.go | 4 +- packages/orchestrator/pkg/sandbox/sandbox.go | 54 ++++++++++--------- packages/orchestrator/pkg/sandbox/snapshot.go | 17 +++--- packages/orchestrator/pkg/server/sandboxes.go | 4 +- .../template/build/layer/layer_executor.go | 4 +- 7 files changed, 47 insertions(+), 44 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/build_upload.go b/packages/orchestrator/pkg/sandbox/build_upload.go index bb6a73b4b9..6b9e8ec575 100644 --- a/packages/orchestrator/pkg/sandbox/build_upload.go +++ b/packages/orchestrator/pkg/sandbox/build_upload.go @@ -42,7 +42,7 @@ func NewUpload( useCase string, objectMetadata storage.ObjectMetadata, ) (*Upload, error) { - mem, memV4, err := resolveCompressConfig(ctx, cfg, ff, storage.MemfileName, snap.MemfileBlockSize, useCase) + mem, memV4, err := resolveCompressConfig(ctx, cfg, ff, storage.MemfileName, snap.MemorySnapshot.BlockSize, useCase) if err != nil { return nil, fmt.Errorf("resolve memfile compress config: %w", err) } diff --git a/packages/orchestrator/pkg/sandbox/build_upload_v3.go b/packages/orchestrator/pkg/sandbox/build_upload_v3.go index afc670cd9d..2a901c443d 100644 --- a/packages/orchestrator/pkg/sandbox/build_upload_v3.go +++ b/packages/orchestrator/pkg/sandbox/build_upload_v3.go @@ -15,7 +15,7 @@ import ( ) func (u *Upload) runV3(ctx context.Context) error { - memfilePath, err := u.snap.MemfileDiff.CachePath(ctx) + memfilePath, err := u.snap.MemorySnapshot.Diff.CachePath(ctx) if err != nil { return fmt.Errorf("error getting memfile diff path: %w", err) } @@ -28,7 +28,7 @@ func (u *Upload) runV3(ctx context.Context) error { eg, egCtx := errgroup.WithContext(ctx) eg.Go(func() error { - h, err := u.snap.MemfileDiffHeader.WaitWithContext(egCtx) + h, err := u.snap.MemorySnapshot.DiffHeader.WaitWithContext(egCtx) if err != nil { return fmt.Errorf("wait memfile diff header: %w", err) } @@ -103,7 +103,7 @@ func (u *Upload) runV3(ctx context.Context) error { // Body uploads done; headers must be ready by now (the per-file Goroutines // above already Wait-ed). Wait() is a fast lookup here. - memfileDiffHeader, err := u.snap.MemfileDiffHeader.WaitWithContext(ctx) + memfileDiffHeader, err := u.snap.MemorySnapshot.DiffHeader.WaitWithContext(ctx) if err != nil { return fmt.Errorf("wait memfile diff header: %w", err) } diff --git a/packages/orchestrator/pkg/sandbox/build_upload_v4.go b/packages/orchestrator/pkg/sandbox/build_upload_v4.go index 572d02362a..3a017ceec1 100644 --- a/packages/orchestrator/pkg/sandbox/build_upload_v4.go +++ b/packages/orchestrator/pkg/sandbox/build_upload_v4.go @@ -16,7 +16,7 @@ import ( ) func (u *Upload) runV4(ctx context.Context) error { - memSrc, err := u.snap.MemfileDiff.CachePath(ctx) + memSrc, err := u.snap.MemorySnapshot.Diff.CachePath(ctx) if err != nil { return fmt.Errorf("memfile diff path: %w", err) } @@ -29,7 +29,7 @@ func (u *Upload) runV4(ctx context.Context) error { eg, ctx := errgroup.WithContext(ctx) eg.Go(func() error { - h, err := u.snap.MemfileDiffHeader.WaitWithContext(ctx) + h, err := u.snap.MemorySnapshot.DiffHeader.WaitWithContext(ctx) if err != nil { return fmt.Errorf("wait memfile diff header: %w", err) } diff --git a/packages/orchestrator/pkg/sandbox/sandbox.go b/packages/orchestrator/pkg/sandbox/sandbox.go index f541fc4002..5ffa41994f 100644 --- a/packages/orchestrator/pkg/sandbox/sandbox.go +++ b/packages/orchestrator/pkg/sandbox/sandbox.go @@ -1223,9 +1223,9 @@ func (s *Sandbox) Pause( // For a filesystem-only pause the memory snapshot is skipped entirely: the // memfile diff stays NoDiff with no header, and the memfile-derived fields // stay zero so the snapshot and scheduling metadata carry rootfs only. - mem := memorySnapshot{ - diff: build.Diff(&build.NoDiff{}), - diffHeader: NewResolvedDiffHeader(nil), + mem := MemorySnapshot{ + Diff: build.Diff(&build.NoDiff{}), + DiffHeader: NewResolvedDiffHeader(nil), } if !pauseOpts.filesystemSnapshot { mem, err = s.processMemorySnapshot(ctx, buildID) @@ -1235,7 +1235,7 @@ func (s *Sandbox) Pause( } // NoDiff.Close is a no-op, so registering it for the filesystem-only case is // harmless and keeps the cleanup ordering identical to the memory path. - cleanup.AddNoContext(ctx, mem.diff.Close) + cleanup.AddNoContext(ctx, mem.Diff.Close) rootfsDiff, rootfsHeader, err := pauseProcessRootfs( ctx, @@ -1275,13 +1275,11 @@ func (s *Sandbox) Pause( return &Snapshot{ Snapfile: snapfile, Metafile: metadataFileLink, - MemfileDiff: mem.diff, - MemfileDiffHeader: mem.diffHeader, + MemorySnapshot: mem, RootfsDiff: rootfsDiff, RootfsDiffHeader: rootfsDiffHeader, SchedulingMetadata: schedulingMetadata, FilesystemSnapshot: pauseOpts.filesystemSnapshot, - MemfileBlockSize: mem.blockSize, RootfsBlockSize: originalRootfs.Header().Metadata.BlockSize, BuildID: buildID, @@ -1290,16 +1288,22 @@ func (s *Sandbox) Pause( }, nil } -// memorySnapshot holds the products of memory postprocessing during a -// Pause. For a filesystem-only pause it is left zero-valued except for an empty -// NoDiff and its resolved-nil header. -type memorySnapshot struct { - diff build.Diff - diffHeader *DiffHeader - header *header.Header - blockSize uint64 - // newBytes is the pre-dedup dirty-byte upper bound passed to scheduling - // metadata for the new layer. +// MemorySnapshot bundles the products of memory postprocessing during a Pause: +// the memfile diff, its (async-resolved) header, and the block size. It is +// embedded in Snapshot. For a filesystem-only pause it is zero-valued except for +// an empty NoDiff and a resolved-nil header (see Snapshot.FilesystemSnapshot). +type MemorySnapshot struct { + Diff build.Diff + DiffHeader *DiffHeader + // BlockSize is captured synchronously at Pause time because NewUpload's + // compression validation needs it before the async dedup header resolves; + // the dedup memfile path produces a page-granular Diff.BlockSize() that + // doesn't match the chunker-read granularity on restore. + BlockSize uint64 + + // header (base memfile) and newBytes (pre-dedup dirty-byte upper bound) are + // scheduling inputs consumed only at Pause time, so they stay unexported. + header *header.Header newBytes uint64 } @@ -1307,16 +1311,16 @@ type memorySnapshot struct { // and builds its header — steps 3-5 of Pause. Only called for a full memory // snapshot; a filesystem-only pause skips it. The returned diff's Close must be // registered for cleanup by the caller. -func (s *Sandbox) processMemorySnapshot(ctx context.Context, buildID uuid.UUID) (memorySnapshot, error) { +func (s *Sandbox) processMemorySnapshot(ctx context.Context, buildID uuid.UUID) (MemorySnapshot, error) { originalMemfile, err := s.Template.Memfile(ctx) if err != nil { - return memorySnapshot{}, fmt.Errorf("failed to get original memfile: %w", err) + return MemorySnapshot{}, fmt.Errorf("failed to get original memfile: %w", err) } memfileHeader := originalMemfile.Header() memfileDiffMetadata, err := s.Resources.memory.DiffMetadata(ctx, s.process) if err != nil { - return memorySnapshot{}, fmt.Errorf("failed to get memfile metadata: %w", err) + return MemorySnapshot{}, fmt.Errorf("failed to get memfile metadata: %w", err) } recordSnapshotDiff(ctx, "memfile", memfileDiffMetadata, memfileHeader) @@ -1352,14 +1356,14 @@ func (s *Sandbox) processMemorySnapshot(ctx context.Context, buildID uuid.UUID) dedupBudget, ) if err != nil { - return memorySnapshot{}, fmt.Errorf("error while post processing: %w", err) + return MemorySnapshot{}, fmt.Errorf("error while post processing: %w", err) } - return memorySnapshot{ - diff: memfileDiff, - diffHeader: memfileDiffHeader, + return MemorySnapshot{ + Diff: memfileDiff, + DiffHeader: memfileDiffHeader, + BlockSize: memfileHeader.Metadata.BlockSize, header: memfileHeader, - blockSize: memfileHeader.Metadata.BlockSize, newBytes: memfileDiffMetadata.Dirty.GetCardinality() * uint64(memfileDiffMetadata.BlockSize), }, nil } diff --git a/packages/orchestrator/pkg/sandbox/snapshot.go b/packages/orchestrator/pkg/sandbox/snapshot.go index 4cd81e7bff..9b90397960 100644 --- a/packages/orchestrator/pkg/sandbox/snapshot.go +++ b/packages/orchestrator/pkg/sandbox/snapshot.go @@ -27,8 +27,10 @@ func NewResolvedDiffHeader(h *header.Header) *DiffHeader { } type Snapshot struct { - MemfileDiff build.Diff - MemfileDiffHeader *DiffHeader + // MemorySnapshot bundles the memfile diff, its header, and block size. It is + // empty (NoDiff) for filesystem-only snapshots (see FilesystemSnapshot). + MemorySnapshot MemorySnapshot + RootfsDiff build.Diff RootfsDiffHeader *DiffHeader Snapfile template.File @@ -43,13 +45,10 @@ type Snapshot struct { // produces a NoDiff memfile but still needs its snapfile uploaded. FilesystemSnapshot bool - // Template block sizes captured sync at Pause time. They equal - // MemfileDiffHeader.Metadata.BlockSize once that header resolves, but - // are needed sync by NewUpload's compression validation — the dedup - // memfile path produces a page-granular Diff.BlockSize() that doesn't - // match the chunker-read granularity on restore. - MemfileBlockSize uint64 - RootfsBlockSize uint64 + // RootfsBlockSize is captured sync at Pause time — needed sync by NewUpload's + // compression validation. (The memfile block size lives in + // MemorySnapshot.BlockSize.) + RootfsBlockSize uint64 cleanup *Cleanup } diff --git a/packages/orchestrator/pkg/server/sandboxes.go b/packages/orchestrator/pkg/server/sandboxes.go index 08d4b8df44..d6b35fd3c7 100644 --- a/packages/orchestrator/pkg/server/sandboxes.go +++ b/packages/orchestrator/pkg/server/sandboxes.go @@ -878,11 +878,11 @@ func (s *Server) snapshotAndCacheSandbox( err = s.templateCache.AddSnapshot( ctx, meta.Template.BuildID, - snapshot.MemfileDiffHeader, + snapshot.MemorySnapshot.DiffHeader, snapshot.RootfsDiffHeader, snapshot.Snapfile, snapshot.Metafile, - snapshot.MemfileDiff, + snapshot.MemorySnapshot.Diff, snapshot.RootfsDiff, ) if err != nil { diff --git a/packages/orchestrator/pkg/template/build/layer/layer_executor.go b/packages/orchestrator/pkg/template/build/layer/layer_executor.go index ad1f0ad23a..443e82574c 100644 --- a/packages/orchestrator/pkg/template/build/layer/layer_executor.go +++ b/packages/orchestrator/pkg/template/build/layer/layer_executor.go @@ -271,11 +271,11 @@ func (lb *LayerExecutor) PauseAndUpload( err = lb.templateCache.AddSnapshot( context.WithoutCancel(ctx), meta.Template.BuildID, - snapshot.MemfileDiffHeader, + snapshot.MemorySnapshot.DiffHeader, snapshot.RootfsDiffHeader, snapshot.Snapfile, snapshot.Metafile, - snapshot.MemfileDiff, + snapshot.MemorySnapshot.Diff, snapshot.RootfsDiff, ) if err != nil { From eaaaae2b8e505a1564d7a91272445ad708050377 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 12 Jun 2026 15:50:06 +0200 Subject: [PATCH 4/8] feat(orchestrator): skip memory artifact uploads for fs-only snapshots MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A filesystem-only snapshot now uploads only rootfs.ext4(.header) and metadata.json. The snapfile upload is skipped in both the V3 and V4 paths (it is still created during pause for its disk drain+flush side effect, just not persisted), and NewUpload skips resolving the memfile compress config — which would otherwise fail validation on the zero memfile block size when compression is enabled. The memfile body and header need no change: NoDiff.CachePath returns an empty path (body self-skips) and the resolved-nil diff header makes both upload paths return early. Absence of the memory artifacts on storage is the on-disk signal that a snapshot is filesystem-only. Memory snapshots are unaffected (FilesystemSnapshot is false). Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Babis Chalios --- .../orchestrator/pkg/sandbox/build_upload.go | 14 ++++++-- .../pkg/sandbox/build_upload_test.go | 36 +++++++++++++++++++ .../pkg/sandbox/build_upload_v3.go | 6 ++++ .../pkg/sandbox/build_upload_v4.go | 6 ++++ packages/orchestrator/pkg/sandbox/sandbox.go | 3 ++ 5 files changed, 62 insertions(+), 3 deletions(-) diff --git a/packages/orchestrator/pkg/sandbox/build_upload.go b/packages/orchestrator/pkg/sandbox/build_upload.go index 6b9e8ec575..1c73b6ff8c 100644 --- a/packages/orchestrator/pkg/sandbox/build_upload.go +++ b/packages/orchestrator/pkg/sandbox/build_upload.go @@ -42,9 +42,17 @@ func NewUpload( useCase string, objectMetadata storage.ObjectMetadata, ) (*Upload, error) { - mem, memV4, err := resolveCompressConfig(ctx, cfg, ff, storage.MemfileName, snap.MemorySnapshot.BlockSize, useCase) - if err != nil { - return nil, fmt.Errorf("resolve memfile compress config: %w", err) + // Filesystem-only snapshots have no memfile (NoDiff, block size 0), so + // resolving its compress config would fail validation ("block size must be + // positive"). The memfile body and header are never uploaded anyway. + var mem storage.CompressConfig + var memV4 bool + var err error + if !snap.FilesystemSnapshot { + mem, memV4, err = resolveCompressConfig(ctx, cfg, ff, storage.MemfileName, snap.MemorySnapshot.BlockSize, useCase) + if err != nil { + return nil, fmt.Errorf("resolve memfile compress config: %w", err) + } } root, rootV4, err := resolveCompressConfig(ctx, cfg, ff, storage.RootfsName, snap.RootfsBlockSize, useCase) if err != nil { diff --git a/packages/orchestrator/pkg/sandbox/build_upload_test.go b/packages/orchestrator/pkg/sandbox/build_upload_test.go index 7de081ad54..366ddd3824 100644 --- a/packages/orchestrator/pkg/sandbox/build_upload_test.go +++ b/packages/orchestrator/pkg/sandbox/build_upload_test.go @@ -150,3 +150,39 @@ func TestAppendAncestorBuilds_NilDstSkipsSynthesis(t *testing.T) { err := u.appendAncestorBuilds(t.Context(), nil, mappingTo(t, 4096, ancestorID, 4096), build.Memfile) require.NoError(t, err) } + +// A filesystem-only snapshot has no memfile, so its MemorySnapshot.BlockSize is +// 0. NewUpload must skip resolving the memfile compress config for it — +// otherwise, with compression enabled, validateCompressConfig would reject the +// zero block size and fail the upload. FrameSizeKB is a multiple of the 4 KiB +// rootfs block so the rootfs config (which is always resolved) stays valid. +func TestNewUpload_FilesystemSnapshotSkipsMemfileCompressConfig(t *testing.T) { + t.Parallel() + + cfg := storage.CompressConfig{Enabled: true, Type: "zstd", FrameSizeKB: 256} + + t.Run("filesystem-only snapshot with zero memfile block size succeeds", func(t *testing.T) { + t.Parallel() + snap := &Snapshot{ + BuildID: uuid.New(), + FilesystemSnapshot: true, + RootfsBlockSize: 4096, + } + + u, err := NewUpload(t.Context(), nil, snap, nil, cfg, nil, storage.UseCaseBuild, storage.ObjectMetadata{}) + require.NoError(t, err) + require.NotNil(t, u) + }) + + t.Run("memory snapshot with zero memfile block size still errors", func(t *testing.T) { + t.Parallel() + snap := &Snapshot{ + BuildID: uuid.New(), + FilesystemSnapshot: false, + RootfsBlockSize: 4096, + } + + _, err := NewUpload(t.Context(), nil, snap, nil, cfg, nil, storage.UseCaseBuild, storage.ObjectMetadata{}) + require.Error(t, err) + }) +} diff --git a/packages/orchestrator/pkg/sandbox/build_upload_v3.go b/packages/orchestrator/pkg/sandbox/build_upload_v3.go index 2a901c443d..fe997e5c6b 100644 --- a/packages/orchestrator/pkg/sandbox/build_upload_v3.go +++ b/packages/orchestrator/pkg/sandbox/build_upload_v3.go @@ -90,6 +90,12 @@ func (u *Upload) runV3(ctx context.Context) error { }) eg.Go(func() error { + // Filesystem-only snapshots resume by reboot, not snapfile restore, so + // the snapfile (created only for its disk-flush side effect) is not uploaded. + if u.snap.FilesystemSnapshot { + return nil + } + return uploadBlobWithMetrics(egCtx, u.store, u.paths.Snapfile(), storage.SnapfileObjectType, u.snap.Snapfile.Path(), uploadFileSnap, meta) }) diff --git a/packages/orchestrator/pkg/sandbox/build_upload_v4.go b/packages/orchestrator/pkg/sandbox/build_upload_v4.go index 3a017ceec1..b768754396 100644 --- a/packages/orchestrator/pkg/sandbox/build_upload_v4.go +++ b/packages/orchestrator/pkg/sandbox/build_upload_v4.go @@ -55,6 +55,12 @@ func (u *Upload) runV4(ctx context.Context) error { meta := storage.WithMetadata(u.objectMetadata) eg.Go(func() error { + // Filesystem-only snapshots resume by reboot, not snapfile restore, so + // the snapfile (created only for its disk-flush side effect) is not uploaded. + if u.snap.FilesystemSnapshot { + return nil + } + return uploadBlobWithMetrics(ctx, u.store, u.paths.Snapfile(), storage.SnapfileObjectType, u.snap.Snapfile.Path(), uploadFileSnap, meta) }) diff --git a/packages/orchestrator/pkg/sandbox/sandbox.go b/packages/orchestrator/pkg/sandbox/sandbox.go index 5ffa41994f..0baf0f4946 100644 --- a/packages/orchestrator/pkg/sandbox/sandbox.go +++ b/packages/orchestrator/pkg/sandbox/sandbox.go @@ -1207,6 +1207,9 @@ func (s *Sandbox) Pause( snapfile := template.NewLocalFileLink(cachePaths.CacheSnapfile()) cleanup.AddNoContext(ctx, snapfile.Close) + // CreateSnapshot also drains and flushes the virtio disk in our custom FC, so + // it runs even for a filesystem-only pause (which needs the disk flush); the + // resulting snapfile is just not uploaded in that case. err = s.process.CreateSnapshot(ctx, snapfile.Path()) if err != nil { return nil, fmt.Errorf("error creating snapshot: %w", err) From a985c3aea2e642dd06351cd9c391746a0db4f8ed Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 12 Jun 2026 16:15:45 +0200 Subject: [PATCH 5/8] feat(orchestrator): add Factory.RebootSandbox cold-boot resume primitive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RebootSandbox cold-boots a fresh Firecracker VM from a snapshot's rootfs without restoring guest memory — the resume primitive for filesystem-only snapshots. It masks an empty memfile (sizing NoopMemory only), selects the NBD provider (empty rootfs cache path) so guest TRIM and overlay chaining work like a normal resume, uses the Sync IO engine, and boots via systemd init. The sandbox is marked running only after envd is ready, matching ResumeSandbox's routing guarantee; the caller's absolute end time is honored so queue delay can't extend the TTL. To support the deferred mark-running, CreateSandbox gains a WithDeferredMarkRunning option (cold boot otherwise marks running before envd is up). A StartTypeReboot start type is added, and the ext4 rootflags are pulled into a named constant documenting that "noload" must never be set — fs-only resume relies on ext4 replaying its journal on cold boot. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Babis Chalios --- .../orchestrator/pkg/sandbox/fc/process.go | 10 +- packages/orchestrator/pkg/sandbox/reboot.go | 119 ++++++++++++++++++ packages/orchestrator/pkg/sandbox/sandbox.go | 25 +++- 3 files changed, 151 insertions(+), 3 deletions(-) create mode 100644 packages/orchestrator/pkg/sandbox/reboot.go diff --git a/packages/orchestrator/pkg/sandbox/fc/process.go b/packages/orchestrator/pkg/sandbox/fc/process.go index 761a0f138d..050f4890b2 100644 --- a/packages/orchestrator/pkg/sandbox/fc/process.go +++ b/packages/orchestrator/pkg/sandbox/fc/process.go @@ -78,6 +78,13 @@ func (f *fcLogFilter) Write(p []byte) (n int, err error) { return len(p), err } +// ext4RootFlags are the ext4 mount flags passed on the kernel cmdline. +// discard: ext4 issues TRIM on freed blocks so they are elided from the +// snapshot diff. It must never include "noload": a filesystem-only snapshot +// resume cold-boots from the snapshot rootfs and relies on ext4 replaying the +// journal on mount. +const ext4RootFlags = "discard" + type ProcessOptions struct { // IoEngine is the io engine to use for the rootfs drive. IoEngine *string @@ -374,8 +381,7 @@ func (p *Process) Create( "i8042.noaux": "", "random.trust_cpu": "on", - // discard: ext4 issues TRIM on freed blocks so they are elided from the snapshot diff. - "rootflags": "discard", + "rootflags": ext4RootFlags, } if options.KvmClock { diff --git a/packages/orchestrator/pkg/sandbox/reboot.go b/packages/orchestrator/pkg/sandbox/reboot.go new file mode 100644 index 0000000000..abb29943cb --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/reboot.go @@ -0,0 +1,119 @@ +//go:build linux + +package sandbox + +import ( + "context" + "errors" + "fmt" + "time" + + "github.com/google/uuid" + + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/block" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/fc" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/template" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/template/constants" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/units" + "github.com/e2b-dev/infra/packages/shared/pkg/fc/models" + "github.com/e2b-dev/infra/packages/shared/pkg/grpc/orchestrator" + "github.com/e2b-dev/infra/packages/shared/pkg/storage/header" + "github.com/e2b-dev/infra/packages/shared/pkg/utils" +) + +const ( + // minEnvdVersionForKVMClock is the minimum envd version that supports kvm-clock. + minEnvdVersionForKVMClock = "0.2.11" + + // rebootEnvdTimeout bounds the systemd boot + envd start; a cold boot needs a + // longer window than a memory resume (matches the template build's wait). + rebootEnvdTimeout = 60 * time.Second +) + +// RebootSandbox cold-boots a fresh Firecracker VM from the template's rootfs, +// without restoring guest memory. Used to resume filesystem-only snapshots: +// guest RAM, processes, and sockets are lost; only the filesystem survives. +// The sandbox is marked running only after envd is ready, matching +// ResumeSandbox's routing guarantees; endAt is the caller's absolute end time. +// IMPORTANT: You must Close() the sandbox after you are done with it. +func (f *Factory) RebootSandbox( + ctx context.Context, + t template.Template, + config *Config, + runtime RuntimeMetadata, + endAt time.Time, + apiConfigToStore *orchestrator.SandboxConfig, +) (*Sandbox, error) { + ctx, span := tracer.Start(ctx, "reboot sandbox") + defer span.End() + + buildID, err := uuid.Parse(t.Files().BuildID) + if err != nil { + return nil, fmt.Errorf("parse build ID: %w", err) + } + + // The masked empty memfile is used only for sizing NoopMemory — guest RAM + // is FC's own fresh anonymous memory. + pageSize := int64(header.PageSize) + if config.HugePages { + pageSize = int64(header.HugepageSize) + } + memfile, err := block.NewEmpty(units.MBToBytes(config.RamMB), pageSize, buildID) + if err != nil { + return nil, fmt.Errorf("create empty memfile: %w", err) + } + + maskedTemplate := template.NewMaskTemplate(t, template.WithMemfile(memfile)) + + kvmClock, err := utils.IsGTEVersion(config.Envd.Version, minEnvdVersionForKVMClock) + if err != nil { + return nil, fmt.Errorf("compare envd version: %w", err) + } + + // Sync IO engine so no async writes are in flight if the sandbox is paused again. + ioEngine := models.DriveIoEngineSync + + timeout := time.Until(endAt) + if timeout <= 0 { + return nil, fmt.Errorf("sandbox end time %s is in the past", endAt) + } + + sbx, err := f.CreateSandbox( + ctx, + config, + runtime, + maskedTemplate, + timeout, + // Empty rootfs cache path selects the NBD provider, same as a memory + // resume, so guest TRIM keeps working and a later pause exports the + // overlay diff exactly like a normal resume. + "", + fc.ProcessOptions{ + InitScriptPath: constants.SystemdInitPath, + KvmClock: kvmClock, + IoEngine: &ioEngine, + }, + apiConfigToStore, + nil, + WithDeferredMarkRunning(), + ) + if err != nil { + return nil, fmt.Errorf("create sandbox from rootfs: %w", err) + } + + // CreateSandbox anchors the lifetime to now; honor the caller's absolute end + // time so queue delay can't extend the TTL. + sbx.SetEndAt(endAt) + + if err := sbx.WaitForEnvd(ctx, StartTypeReboot, rebootEnvdTimeout); err != nil { + closeErr := sbx.Close(context.WithoutCancel(ctx)) + + return nil, errors.Join(fmt.Errorf("wait for envd after reboot: %w", err), closeErr) + } + + f.Sandboxes.MarkRunning(ctx, sbx) + + go sbx.Checks.Start(context.WithoutCancel(ctx)) + + return sbx, nil +} diff --git a/packages/orchestrator/pkg/sandbox/sandbox.go b/packages/orchestrator/pkg/sandbox/sandbox.go index 0baf0f4946..b61f39949a 100644 --- a/packages/orchestrator/pkg/sandbox/sandbox.go +++ b/packages/orchestrator/pkg/sandbox/sandbox.go @@ -61,6 +61,7 @@ var ( const ( StartTypeCreate = "create" // cold boot (template build) StartTypeResume = "resume" // resume from a snapshot (the common runtime path) + StartTypeReboot = "reboot" // cold boot from a snapshot rootfs (filesystem-only resume) ) var SandboxHttpTransport = otelhttp.NewTransport( @@ -348,6 +349,20 @@ func (f *Factory) EgressProxy() network.EgressProxy { // on the host side. type PreBootFn func(ctx context.Context, rootfsPath string) error +type createOptions struct { + deferMarkRunning bool +} + +type CreateOption func(*createOptions) + +// WithDeferredMarkRunning skips marking the sandbox running inside CreateSandbox +// so the caller can mark it only after envd is ready, matching ResumeSandbox. +// Used by the reboot path, where the guest is cold-booting and must not be +// routable until envd answers. +func WithDeferredMarkRunning() CreateOption { + return func(o *createOptions) { o.deferMarkRunning = true } +} + // CreateSandbox creates the sandbox. // IMPORTANT: You must Close() the sandbox after you are done with it. func (f *Factory) CreateSandbox( @@ -360,11 +375,17 @@ func (f *Factory) CreateSandbox( processOptions fc.ProcessOptions, apiConfigToStore *orchestrator.SandboxConfig, preBootFn PreBootFn, + opts ...CreateOption, ) (s *Sandbox, e error) { ctx, span := tracer.Start(ctx, "create sandbox") defer span.End() defer handleSpanError(span, &e) + var createOpts createOptions + for _, opt := range opts { + opt(&createOpts) + } + execCtx, execSpan := startExecutionSpan(ctx) exit := utils.NewErrorOnce() @@ -588,7 +609,9 @@ func (f *Factory) CreateSandbox( exit.SetError(errors.Join(err, fcErr)) }() - f.Sandboxes.MarkRunning(ctx, sbx) + if !createOpts.deferMarkRunning { + f.Sandboxes.MarkRunning(ctx, sbx) + } return sbx, nil } From 1dff17dd9eca05a1dbc245a1a298179cc0280513 Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 12 Jun 2026 16:26:01 +0200 Subject: [PATCH 6/8] fix(orchestrator): write MMDS access-token hash on cold-boot reboot A memory resume calls setMmds with the access-token hash (fc/process.go Resume) so the guest envd can authenticate /init against the MMDS hash. Cold boot (Process.Create) only configured the MMDS transport and never wrote the data, so a rebooted secure sandbox would start envd with no hash and /init would fall into the unauthenticated first-time-setup branch. Process.Create now writes the MMDS metadata before startVM when ProcessOptions.AccessToken is set, mirroring Resume. RebootSandbox always sets it; an empty token hashes to the "no token" value, matching Resume's behavior for non-secure sandboxes. Both the MMDS hash and the /init token sent by WaitForEnvd are sourced from config.Envd.AccessToken, so they always agree. Template-build cold boots leave AccessToken nil and are unaffected. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Babis Chalios --- .../orchestrator/pkg/sandbox/fc/process.go | 29 +++++++++++++++++++ packages/orchestrator/pkg/sandbox/reboot.go | 9 ++++++ 2 files changed, 38 insertions(+) diff --git a/packages/orchestrator/pkg/sandbox/fc/process.go b/packages/orchestrator/pkg/sandbox/fc/process.go index 050f4890b2..cf28fe7ed9 100644 --- a/packages/orchestrator/pkg/sandbox/fc/process.go +++ b/packages/orchestrator/pkg/sandbox/fc/process.go @@ -102,6 +102,14 @@ type ProcessOptions struct { // KvmClock is a flag to enable kvm-clock as the clocksource for the kernel. KvmClock bool + // AccessToken, when non-nil, makes Create write the guest MMDS metadata + // (sandbox/template IDs, logs address, and the access-token hash) before the + // VM boots, so a cold-booted envd can authenticate /init the same way it does + // after a memory resume. An empty string hashes to the "no token" value, + // matching Resume. Template-build cold boots leave it nil and skip the write, + // preserving their existing behavior. + AccessToken *string + // Stdout is the writer to which the process stdout will be written. Stdout io.Writer @@ -469,6 +477,27 @@ func (p *Process) Create( ) } + // Write MMDS metadata before boot when an access token is provided (the + // cold-boot/reboot user path) so the guest envd can authenticate /init the + // same way it does after a memory resume. The MMDS transport is already + // configured by setNetworkInterface above. Template-build cold boots leave + // AccessToken nil and skip this, preserving their existing behavior. + if options.AccessToken != nil { + md := sbxMetadata.LoggerMetadata() + meta := &MmdsMetadata{ + SandboxID: md.SandboxID, + TemplateID: md.TemplateID, + LogsCollectorAddress: fmt.Sprintf("http://%s/logs", p.config.NetworkConfig.OrchestratorInSandboxIPAddress), + AccessTokenHash: keys.HashAccessToken(*options.AccessToken), + } + if err := p.client.setMmds(ctx, meta); err != nil { + fcStopErr := p.Stop(ctx) + + return errors.Join(fmt.Errorf("error setting mmds: %w", err), fcStopErr) + } + telemetry.ReportEvent(ctx, "set fc mmds metadata") + } + err = p.client.startVM(ctx) if err != nil { fcStopErr := p.Stop(ctx) diff --git a/packages/orchestrator/pkg/sandbox/reboot.go b/packages/orchestrator/pkg/sandbox/reboot.go index abb29943cb..10b542b5f6 100644 --- a/packages/orchestrator/pkg/sandbox/reboot.go +++ b/packages/orchestrator/pkg/sandbox/reboot.go @@ -73,6 +73,14 @@ func (f *Factory) RebootSandbox( // Sync IO engine so no async writes are in flight if the sandbox is paused again. ioEngine := models.DriveIoEngineSync + // Always write MMDS metadata for a reboot so the cold-booted envd can + // authenticate /init like a memory resume. An empty token hashes to the + // "no token" value, matching ResumeSandbox's behavior for non-secure sandboxes. + accessToken := "" + if config.Envd.AccessToken != nil { + accessToken = *config.Envd.AccessToken + } + timeout := time.Until(endAt) if timeout <= 0 { return nil, fmt.Errorf("sandbox end time %s is in the past", endAt) @@ -92,6 +100,7 @@ func (f *Factory) RebootSandbox( InitScriptPath: constants.SystemdInitPath, KvmClock: kvmClock, IoEngine: &ioEngine, + AccessToken: &accessToken, }, apiConfigToStore, nil, From 732a65551d4476f382b46d2a44cef9fb19815e0b Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Fri, 12 Jun 2026 16:16:26 +0200 Subject: [PATCH 7/8] test(orchestrator): wire -reboot into cmd/resume-build Add a -reboot flag that cold-boots from the build's rootfs via Factory.RebootSandbox instead of resuming from the memory snapshot, routed through a new runner.startSandbox helper used by the resume, interactive, cmd, and pause paths. Lets the filesystem-only pause + reboot resume cycle be exercised end to end locally (e.g. `resume-build -fs-only -cmd-pause '...'` then `resume-build -reboot -cmd 'cat /proc/uptime'`). Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Babis Chalios --- .../orchestrator/cmd/resume-build/main.go | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/packages/orchestrator/cmd/resume-build/main.go b/packages/orchestrator/cmd/resume-build/main.go index 71ecae4ee9..92331a8e37 100644 --- a/packages/orchestrator/cmd/resume-build/main.go +++ b/packages/orchestrator/cmd/resume-build/main.go @@ -72,6 +72,7 @@ func main() { cmdSignalPause := flag.String("cmd-signal-pause", "", "execute command in sandbox, then wait for SIGUSR1 before pausing") optimize := flag.Bool("optimize", false, "collect fresh prefetch mapping after pause (resumes snapshot to record page faults)") fsOnly := flag.Bool("fs-only", false, "pause without a memory snapshot (filesystem-only; resume reboots the guest)") + reboot := flag.Bool("reboot", false, "cold-boot from the build's rootfs instead of resuming from memory") shell := flag.Bool("shell", false, "attach an interactive PTY shell via envd (no sshd required in the sandbox)") fphTimeoutMs := flag.Int("fph-timeout-ms", 0, "override free-page-hinting-config pause timeout LD flag (0 = use LD default)") @@ -219,7 +220,7 @@ func main() { } fphBenchOpts := fphBenchOptions{enabled: *fphBench, workload: *cmdPause, iterations: benchIters, delay: *fphBenchDelay} - err := run(ctx, *fromBuild, *iterations, *coldStart, *noPrefetch, *noEgress, *verbose, *shell, pauseOpts, runOpts, fphBenchOpts) + err := run(ctx, *fromBuild, *iterations, *coldStart, *noPrefetch, *noEgress, *verbose, *shell, *reboot, pauseOpts, runOpts, fphBenchOpts) cancel() if err != nil { @@ -336,10 +337,21 @@ type runner struct { coldStart bool noPrefetch bool shell bool + reboot bool config cfg.BuilderConfig storage storage.StorageProvider } +// startSandbox starts a sandbox from the build, either resuming from its memory +// snapshot or cold-booting (rebooting) from its rootfs when -reboot is set. +func (r *runner) startSandbox(ctx context.Context, runtime sandbox.RuntimeMetadata, start, end time.Time) (*sandbox.Sandbox, error) { + if r.reboot { + return r.factory.RebootSandbox(ctx, r.tmpl, r.sbxConfig, runtime, end, nil) + } + + return r.factory.ResumeSandbox(ctx, r.tmpl, r.sbxConfig, runtime, start, end, nil) +} + func (r *runner) resumeOnce(ctx context.Context, iter int) (time.Duration, error) { runtime := sandbox.RuntimeMetadata{ TemplateID: r.buildID, @@ -349,7 +361,7 @@ func (r *runner) resumeOnce(ctx context.Context, iter int) (time.Duration, error } t0 := time.Now() - sbx, err := r.factory.ResumeSandbox(ctx, r.tmpl, r.sbxConfig, runtime, t0, t0.Add(24*time.Hour), nil) + sbx, err := r.startSandbox(ctx, runtime, t0, t0.Add(24*time.Hour)) dur := time.Since(t0) if sbx != nil { @@ -369,7 +381,7 @@ func (r *runner) interactive(ctx context.Context) error { fmt.Println("🚀 Starting...") t0 := time.Now() - sbx, err := r.factory.ResumeSandbox(ctx, r.tmpl, r.sbxConfig, runtime, t0, t0.Add(24*time.Hour), nil) + sbx, err := r.startSandbox(ctx, runtime, t0, t0.Add(24*time.Hour)) if err != nil { return err } @@ -419,7 +431,7 @@ func (r *runner) cmdOnce(ctx context.Context, opts runOptions, verbose bool) (cm fmt.Println("🚀 Starting sandbox...") } t0 := time.Now() - sbx, err := r.factory.ResumeSandbox(ctx, r.tmpl, r.sbxConfig, runtime, t0, t0.Add(24*time.Hour), nil) + sbx, err := r.startSandbox(ctx, runtime, t0, t0.Add(24*time.Hour)) resumeDur := time.Since(t0) if err != nil { return cmdTimings{resume: resumeDur, err: err}, err @@ -622,7 +634,7 @@ func (r *runner) pauseOnce(ctx context.Context, opts pauseOptions, verbose bool) fmt.Println("🚀 Starting sandbox...") } t0 := time.Now() - sbx, err := r.factory.ResumeSandbox(ctx, r.tmpl, r.sbxConfig, runtime, t0, t0.Add(24*time.Hour), nil) + sbx, err := r.startSandbox(ctx, runtime, t0, t0.Add(24*time.Hour)) resumeDur := time.Since(t0) if err != nil { return pauseTimings{resume: resumeDur, err: err}, err @@ -1039,7 +1051,7 @@ func (r *runner) benchmark(ctx context.Context, n int) error { return lastErr } -func run(ctx context.Context, buildID string, iterations int, coldStart, noPrefetch, noEgress, verbose, shell bool, pauseOpts pauseOptions, runOpts runOptions, fphBenchOpts fphBenchOptions) error { +func run(ctx context.Context, buildID string, iterations int, coldStart, noPrefetch, noEgress, verbose, shell, reboot bool, pauseOpts pauseOptions, runOpts runOptions, fphBenchOpts fphBenchOptions) error { // Silence other loggers unless verbose mode var l logger.Logger if !verbose { @@ -1204,6 +1216,7 @@ func run(ctx context.Context, buildID string, iterations int, coldStart, noPrefe coldStart: coldStart, noPrefetch: noPrefetch, shell: shell, + reboot: reboot, config: config.BuilderConfig, storage: persistence, sbxConfig: sbxCfg, From 6a9519471c6738ebe4e8201291c4556d1395f7ce Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 15 Jun 2026 19:23:18 +0200 Subject: [PATCH 8/8] feat(fs-snap): add metric for guest sync latency Add a metric for measuring the latency of syncing the guest filesystem before taking a snapshot. Also, add a span attribute in `sandbox-snapshot` span that shows when a Pause() operation is on behalf of a filesystem-only snapshot. Signed-off-by: Babis Chalios --- packages/orchestrator/pkg/sandbox/reclaim.go | 13 ++++++++++++- packages/orchestrator/pkg/sandbox/sandbox.go | 1 + packages/shared/pkg/telemetry/meters.go | 3 +++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/packages/orchestrator/pkg/sandbox/reclaim.go b/packages/orchestrator/pkg/sandbox/reclaim.go index 38570621b1..b7f7b9ef9a 100644 --- a/packages/orchestrator/pkg/sandbox/reclaim.go +++ b/packages/orchestrator/pkg/sandbox/reclaim.go @@ -176,12 +176,23 @@ func (s *Sandbox) guestSyncTimeout(ctx context.Context) time.Duration { // error instead of persisting a rootfs missing acknowledged writes. Unlike // bestEffortReclaim's sync step (LD-flag gated, best-effort), this always runs // and always reports failure. -func (s *Sandbox) guestSync(ctx context.Context) error { +func (s *Sandbox) guestSync(ctx context.Context) (e error) { syncTimeout := s.guestSyncTimeout(ctx) + start := time.Now() ctx, span := tracer.Start(ctx, "envd-guest-sync") defer span.End() + // Record on every exit so slow and timed-out syncs are captured too. + defer func() { + guestSyncDurationHistogram.Record(ctx, time.Since(start).Milliseconds(), + metric.WithAttributes( + attribute.Bool("success", e == nil), + attribute.Int64("timeout_ms", syncTimeout.Milliseconds()), + ), + ) + }() + rcCtx, cancel := context.WithTimeout(ctx, syncTimeout+reclaimOuterSlack) defer cancel() diff --git a/packages/orchestrator/pkg/sandbox/sandbox.go b/packages/orchestrator/pkg/sandbox/sandbox.go index b61f39949a..8d7567995c 100644 --- a/packages/orchestrator/pkg/sandbox/sandbox.go +++ b/packages/orchestrator/pkg/sandbox/sandbox.go @@ -50,6 +50,7 @@ var ( waitForEnvdDurationHistogram = utils.Must(telemetry.GetHistogram(meter, telemetry.WaitForEnvdDurationHistogramName)) envdCollapseDurationHistogram = utils.Must(telemetry.GetHistogram(meter, telemetry.EnvdCollapseDurationHistogramName)) envdCollapseChunks = utils.Must(telemetry.GetCounter(meter, telemetry.EnvdCollapseChunks)) + guestSyncDurationHistogram = utils.Must(telemetry.GetHistogram(meter, telemetry.GuestSyncDurationHistogramName)) uffdStartupPagesHistogram = utils.Must(telemetry.GetHistogram(meter, telemetry.UffdStartupPagesHistogramName)) uffdStartupSourcePagesHistogram = utils.Must(telemetry.GetHistogram(meter, telemetry.UffdStartupSourcePagesHistogramName)) diff --git a/packages/shared/pkg/telemetry/meters.go b/packages/shared/pkg/telemetry/meters.go index 678683e06e..2d77c8593e 100644 --- a/packages/shared/pkg/telemetry/meters.go +++ b/packages/shared/pkg/telemetry/meters.go @@ -84,6 +84,7 @@ const ( // Sandbox timing histograms OrchestratorSandboxCreateDurationName HistogramType = "orchestrator.sandbox.create.duration" WaitForEnvdDurationHistogramName HistogramType = "orchestrator.sandbox.envd.init.duration" + GuestSyncDurationHistogramName HistogramType = "orchestrator.sandbox.guest_sync.duration" // Pre-pause envd heap collapse round-trip duration (the pause-path cost of // POST /collapse: network plus envd's madvise work), recorded once per pause @@ -409,6 +410,7 @@ var histogramDesc = map[HistogramType]string{ OrchestratorSandboxCreateDurationName: "Time taken to create a sandbox", WaitForEnvdDurationHistogramName: "Time taken for Envd to initialize successfully", EnvdCollapseDurationHistogramName: "Time taken for the pre-pause envd heap collapse round-trip", + GuestSyncDurationHistogramName: "Time taken for the mandatory pre-pause guest sync (filesystem-only pause)", UffdStartupPagesHistogramName: "Demand-fault pages a guest needed to reach a successful envd init, per start", UffdStartupSourcePagesHistogramName: "Subset of startup demand-fault pages pulled from the source (e.g. GCS), per start", @@ -455,6 +457,7 @@ var histogramUnits = map[HistogramType]string{ OrchestratorSandboxCreateDurationName: "ms", WaitForEnvdDurationHistogramName: "ms", EnvdCollapseDurationHistogramName: "ms", + GuestSyncDurationHistogramName: "ms", UffdStartupPagesHistogramName: "{page}", UffdStartupSourcePagesHistogramName: "{page}", UffdStartupBytesHistogramName: "{By}",