diff --git a/packages/orchestrator/cmd/resume-build/main.go b/packages/orchestrator/cmd/resume-build/main.go index b280936a6a..dbde5882d0 100644 --- a/packages/orchestrator/cmd/resume-build/main.go +++ b/packages/orchestrator/cmd/resume-build/main.go @@ -71,6 +71,8 @@ func main() { cmdPause := flag.String("cmd-pause", "", "execute command in sandbox, then pause on success") cmdSignalPause := flag.String("cmd-signal-pause", "", "execute command in sandbox, then wait for SIGUSR1 before pausing") optimize := flag.Bool("optimize", false, "collect fresh prefetch mapping after pause (resumes snapshot to record page faults)") + fsOnly := flag.Bool("fs-only", false, "pause without a memory snapshot (filesystem-only; resume reboots the guest)") + reboot := flag.Bool("reboot", false, "cold-boot from the build's rootfs instead of resuming from memory") shell := flag.Bool("shell", false, "attach an interactive PTY shell via envd (no sshd required in the sandbox)") fphTimeoutMs := flag.Int("fph-timeout-ms", 0, "override free-page-hinting-config pause timeout LD flag (0 = use LD default)") @@ -157,6 +159,12 @@ func main() { if *optimize && *iterations > 0 { log.Fatal("-optimize is incompatible with -iterations (benchmarking doesn't upload)") } + if *fsOnly && !isPauseMode { + log.Fatal("-fs-only requires a pause flag (-pause, -signal-pause, -cmd-pause, or -cmd-signal-pause)") + } + if *fsOnly && (*optimize || *fphBench) { + log.Fatal("-fs-only is incompatible with -optimize and -fph-bench (no memory snapshot)") + } if *shell && (isCmdMode || isPauseMode || *iterations > 0) { log.Fatal("-shell can only be used in interactive mode (no -cmd, no pause flags, no -iterations)") @@ -193,6 +201,7 @@ func main() { newBuildID: outputBuildID, iterations: *iterations, optimize: *optimize, + fsOnly: *fsOnly, } runOpts := runOptions{ @@ -206,7 +215,7 @@ func main() { } fphBenchOpts := fphBenchOptions{enabled: *fphBench, workload: *cmdPause, iterations: benchIters, delay: *fphBenchDelay} - err := run(ctx, *fromBuild, *iterations, *coldStart, *noPrefetch, *noEgress, *verbose, *shell, pauseOpts, runOpts, fphBenchOpts) + err := run(ctx, *fromBuild, *iterations, *coldStart, *noPrefetch, *noEgress, *verbose, *shell, *reboot, pauseOpts, runOpts, fphBenchOpts) cancel() if err != nil { @@ -228,6 +237,7 @@ type pauseOptions struct { newBuildID string iterations int // for benchmarking pause (only with immediate) optimize bool + fsOnly bool } func (p pauseOptions) enabled() bool { @@ -322,10 +332,21 @@ type runner struct { coldStart bool noPrefetch bool shell bool + reboot bool config cfg.BuilderConfig storage storage.StorageProvider } +// startSandbox starts a sandbox from the build, either resuming from its memory +// snapshot or cold-booting (rebooting) from its rootfs when -reboot is set. +func (r *runner) startSandbox(ctx context.Context, runtime sandbox.RuntimeMetadata, start, end time.Time) (*sandbox.Sandbox, error) { + if r.reboot { + return r.factory.RebootSandbox(ctx, r.tmpl, r.sbxConfig, runtime, end, nil) + } + + return r.factory.ResumeSandbox(ctx, r.tmpl, r.sbxConfig, runtime, start, end, nil) +} + func (r *runner) resumeOnce(ctx context.Context, iter int) (time.Duration, error) { runtime := sandbox.RuntimeMetadata{ TemplateID: r.buildID, @@ -335,7 +356,7 @@ func (r *runner) resumeOnce(ctx context.Context, iter int) (time.Duration, error } t0 := time.Now() - sbx, err := r.factory.ResumeSandbox(ctx, r.tmpl, r.sbxConfig, runtime, t0, t0.Add(24*time.Hour), nil) + sbx, err := r.startSandbox(ctx, runtime, t0, t0.Add(24*time.Hour)) dur := time.Since(t0) if sbx != nil { @@ -355,7 +376,7 @@ func (r *runner) interactive(ctx context.Context) error { fmt.Println("🚀 Starting...") t0 := time.Now() - sbx, err := r.factory.ResumeSandbox(ctx, r.tmpl, r.sbxConfig, runtime, t0, t0.Add(24*time.Hour), nil) + sbx, err := r.startSandbox(ctx, runtime, t0, t0.Add(24*time.Hour)) if err != nil { return err } @@ -405,7 +426,7 @@ func (r *runner) cmdOnce(ctx context.Context, opts runOptions, verbose bool) (cm fmt.Println("🚀 Starting sandbox...") } t0 := time.Now() - sbx, err := r.factory.ResumeSandbox(ctx, r.tmpl, r.sbxConfig, runtime, t0, t0.Add(24*time.Hour), nil) + sbx, err := r.startSandbox(ctx, runtime, t0, t0.Add(24*time.Hour)) resumeDur := time.Since(t0) if err != nil { return cmdTimings{resume: resumeDur, err: err}, err @@ -608,7 +629,7 @@ func (r *runner) pauseOnce(ctx context.Context, opts pauseOptions, verbose bool) fmt.Println("🚀 Starting sandbox...") } t0 := time.Now() - sbx, err := r.factory.ResumeSandbox(ctx, r.tmpl, r.sbxConfig, runtime, t0, t0.Add(24*time.Hour), nil) + sbx, err := r.startSandbox(ctx, runtime, t0, t0.Add(24*time.Hour)) resumeDur := time.Since(t0) if err != nil { return pauseTimings{resume: resumeDur, err: err}, err @@ -671,8 +692,12 @@ func (r *runner) pauseOnce(ctx context.Context, opts pauseOptions, verbose bool) } // Pause and create snapshot + var pauseSnapshotOpts []sandbox.PauseOption + if opts.fsOnly { + pauseSnapshotOpts = append(pauseSnapshotOpts, sandbox.WithFilesystemSnapshot()) + } pauseStart := time.Now() - snapshot, err := sbx.Pause(ctx, newMeta, sandbox.SnapshotUseCasePause) + snapshot, err := sbx.Pause(ctx, newMeta, sandbox.SnapshotUseCasePause, pauseSnapshotOpts...) pauseDur := time.Since(pauseStart) totalDur := time.Since(t0) @@ -1021,7 +1046,7 @@ func (r *runner) benchmark(ctx context.Context, n int) error { return lastErr } -func run(ctx context.Context, buildID string, iterations int, coldStart, noPrefetch, noEgress, verbose, shell bool, pauseOpts pauseOptions, runOpts runOptions, fphBenchOpts fphBenchOptions) error { +func run(ctx context.Context, buildID string, iterations int, coldStart, noPrefetch, noEgress, verbose, shell, reboot bool, pauseOpts pauseOptions, runOpts runOptions, fphBenchOpts fphBenchOptions) error { // Silence other loggers unless verbose mode var l logger.Logger if !verbose { @@ -1186,6 +1211,7 @@ func run(ctx context.Context, buildID string, iterations int, coldStart, noPrefe coldStart: coldStart, noPrefetch: noPrefetch, shell: shell, + reboot: reboot, config: config.BuilderConfig, storage: persistence, sbxConfig: sbxCfg, diff --git a/packages/orchestrator/pkg/sandbox/build_upload.go b/packages/orchestrator/pkg/sandbox/build_upload.go index bb6a73b4b9..1c73b6ff8c 100644 --- a/packages/orchestrator/pkg/sandbox/build_upload.go +++ b/packages/orchestrator/pkg/sandbox/build_upload.go @@ -42,9 +42,17 @@ func NewUpload( useCase string, objectMetadata storage.ObjectMetadata, ) (*Upload, error) { - mem, memV4, err := resolveCompressConfig(ctx, cfg, ff, storage.MemfileName, snap.MemfileBlockSize, useCase) - if err != nil { - return nil, fmt.Errorf("resolve memfile compress config: %w", err) + // Filesystem-only snapshots have no memfile (NoDiff, block size 0), so + // resolving its compress config would fail validation ("block size must be + // positive"). The memfile body and header are never uploaded anyway. + var mem storage.CompressConfig + var memV4 bool + var err error + if !snap.FilesystemSnapshot { + mem, memV4, err = resolveCompressConfig(ctx, cfg, ff, storage.MemfileName, snap.MemorySnapshot.BlockSize, useCase) + if err != nil { + return nil, fmt.Errorf("resolve memfile compress config: %w", err) + } } root, rootV4, err := resolveCompressConfig(ctx, cfg, ff, storage.RootfsName, snap.RootfsBlockSize, useCase) if err != nil { diff --git a/packages/orchestrator/pkg/sandbox/build_upload_test.go b/packages/orchestrator/pkg/sandbox/build_upload_test.go index 7de081ad54..366ddd3824 100644 --- a/packages/orchestrator/pkg/sandbox/build_upload_test.go +++ b/packages/orchestrator/pkg/sandbox/build_upload_test.go @@ -150,3 +150,39 @@ func TestAppendAncestorBuilds_NilDstSkipsSynthesis(t *testing.T) { err := u.appendAncestorBuilds(t.Context(), nil, mappingTo(t, 4096, ancestorID, 4096), build.Memfile) require.NoError(t, err) } + +// A filesystem-only snapshot has no memfile, so its MemorySnapshot.BlockSize is +// 0. NewUpload must skip resolving the memfile compress config for it — +// otherwise, with compression enabled, validateCompressConfig would reject the +// zero block size and fail the upload. FrameSizeKB is a multiple of the 4 KiB +// rootfs block so the rootfs config (which is always resolved) stays valid. +func TestNewUpload_FilesystemSnapshotSkipsMemfileCompressConfig(t *testing.T) { + t.Parallel() + + cfg := storage.CompressConfig{Enabled: true, Type: "zstd", FrameSizeKB: 256} + + t.Run("filesystem-only snapshot with zero memfile block size succeeds", func(t *testing.T) { + t.Parallel() + snap := &Snapshot{ + BuildID: uuid.New(), + FilesystemSnapshot: true, + RootfsBlockSize: 4096, + } + + u, err := NewUpload(t.Context(), nil, snap, nil, cfg, nil, storage.UseCaseBuild, storage.ObjectMetadata{}) + require.NoError(t, err) + require.NotNil(t, u) + }) + + t.Run("memory snapshot with zero memfile block size still errors", func(t *testing.T) { + t.Parallel() + snap := &Snapshot{ + BuildID: uuid.New(), + FilesystemSnapshot: false, + RootfsBlockSize: 4096, + } + + _, err := NewUpload(t.Context(), nil, snap, nil, cfg, nil, storage.UseCaseBuild, storage.ObjectMetadata{}) + require.Error(t, err) + }) +} diff --git a/packages/orchestrator/pkg/sandbox/build_upload_v3.go b/packages/orchestrator/pkg/sandbox/build_upload_v3.go index afc670cd9d..fe997e5c6b 100644 --- a/packages/orchestrator/pkg/sandbox/build_upload_v3.go +++ b/packages/orchestrator/pkg/sandbox/build_upload_v3.go @@ -15,7 +15,7 @@ import ( ) func (u *Upload) runV3(ctx context.Context) error { - memfilePath, err := u.snap.MemfileDiff.CachePath(ctx) + memfilePath, err := u.snap.MemorySnapshot.Diff.CachePath(ctx) if err != nil { return fmt.Errorf("error getting memfile diff path: %w", err) } @@ -28,7 +28,7 @@ func (u *Upload) runV3(ctx context.Context) error { eg, egCtx := errgroup.WithContext(ctx) eg.Go(func() error { - h, err := u.snap.MemfileDiffHeader.WaitWithContext(egCtx) + h, err := u.snap.MemorySnapshot.DiffHeader.WaitWithContext(egCtx) if err != nil { return fmt.Errorf("wait memfile diff header: %w", err) } @@ -90,6 +90,12 @@ func (u *Upload) runV3(ctx context.Context) error { }) eg.Go(func() error { + // Filesystem-only snapshots resume by reboot, not snapfile restore, so + // the snapfile (created only for its disk-flush side effect) is not uploaded. + if u.snap.FilesystemSnapshot { + return nil + } + return uploadBlobWithMetrics(egCtx, u.store, u.paths.Snapfile(), storage.SnapfileObjectType, u.snap.Snapfile.Path(), uploadFileSnap, meta) }) @@ -103,7 +109,7 @@ func (u *Upload) runV3(ctx context.Context) error { // Body uploads done; headers must be ready by now (the per-file Goroutines // above already Wait-ed). Wait() is a fast lookup here. - memfileDiffHeader, err := u.snap.MemfileDiffHeader.WaitWithContext(ctx) + memfileDiffHeader, err := u.snap.MemorySnapshot.DiffHeader.WaitWithContext(ctx) if err != nil { return fmt.Errorf("wait memfile diff header: %w", err) } diff --git a/packages/orchestrator/pkg/sandbox/build_upload_v4.go b/packages/orchestrator/pkg/sandbox/build_upload_v4.go index 572d02362a..b768754396 100644 --- a/packages/orchestrator/pkg/sandbox/build_upload_v4.go +++ b/packages/orchestrator/pkg/sandbox/build_upload_v4.go @@ -16,7 +16,7 @@ import ( ) func (u *Upload) runV4(ctx context.Context) error { - memSrc, err := u.snap.MemfileDiff.CachePath(ctx) + memSrc, err := u.snap.MemorySnapshot.Diff.CachePath(ctx) if err != nil { return fmt.Errorf("memfile diff path: %w", err) } @@ -29,7 +29,7 @@ func (u *Upload) runV4(ctx context.Context) error { eg, ctx := errgroup.WithContext(ctx) eg.Go(func() error { - h, err := u.snap.MemfileDiffHeader.WaitWithContext(ctx) + h, err := u.snap.MemorySnapshot.DiffHeader.WaitWithContext(ctx) if err != nil { return fmt.Errorf("wait memfile diff header: %w", err) } @@ -55,6 +55,12 @@ func (u *Upload) runV4(ctx context.Context) error { meta := storage.WithMetadata(u.objectMetadata) eg.Go(func() error { + // Filesystem-only snapshots resume by reboot, not snapfile restore, so + // the snapfile (created only for its disk-flush side effect) is not uploaded. + if u.snap.FilesystemSnapshot { + return nil + } + return uploadBlobWithMetrics(ctx, u.store, u.paths.Snapfile(), storage.SnapfileObjectType, u.snap.Snapfile.Path(), uploadFileSnap, meta) }) diff --git a/packages/orchestrator/pkg/sandbox/fc/process.go b/packages/orchestrator/pkg/sandbox/fc/process.go index 761a0f138d..cf28fe7ed9 100644 --- a/packages/orchestrator/pkg/sandbox/fc/process.go +++ b/packages/orchestrator/pkg/sandbox/fc/process.go @@ -78,6 +78,13 @@ func (f *fcLogFilter) Write(p []byte) (n int, err error) { return len(p), err } +// ext4RootFlags are the ext4 mount flags passed on the kernel cmdline. +// discard: ext4 issues TRIM on freed blocks so they are elided from the +// snapshot diff. It must never include "noload": a filesystem-only snapshot +// resume cold-boots from the snapshot rootfs and relies on ext4 replaying the +// journal on mount. +const ext4RootFlags = "discard" + type ProcessOptions struct { // IoEngine is the io engine to use for the rootfs drive. IoEngine *string @@ -95,6 +102,14 @@ type ProcessOptions struct { // KvmClock is a flag to enable kvm-clock as the clocksource for the kernel. KvmClock bool + // AccessToken, when non-nil, makes Create write the guest MMDS metadata + // (sandbox/template IDs, logs address, and the access-token hash) before the + // VM boots, so a cold-booted envd can authenticate /init the same way it does + // after a memory resume. An empty string hashes to the "no token" value, + // matching Resume. Template-build cold boots leave it nil and skip the write, + // preserving their existing behavior. + AccessToken *string + // Stdout is the writer to which the process stdout will be written. Stdout io.Writer @@ -374,8 +389,7 @@ func (p *Process) Create( "i8042.noaux": "", "random.trust_cpu": "on", - // discard: ext4 issues TRIM on freed blocks so they are elided from the snapshot diff. - "rootflags": "discard", + "rootflags": ext4RootFlags, } if options.KvmClock { @@ -463,6 +477,27 @@ func (p *Process) Create( ) } + // Write MMDS metadata before boot when an access token is provided (the + // cold-boot/reboot user path) so the guest envd can authenticate /init the + // same way it does after a memory resume. The MMDS transport is already + // configured by setNetworkInterface above. Template-build cold boots leave + // AccessToken nil and skip this, preserving their existing behavior. + if options.AccessToken != nil { + md := sbxMetadata.LoggerMetadata() + meta := &MmdsMetadata{ + SandboxID: md.SandboxID, + TemplateID: md.TemplateID, + LogsCollectorAddress: fmt.Sprintf("http://%s/logs", p.config.NetworkConfig.OrchestratorInSandboxIPAddress), + AccessTokenHash: keys.HashAccessToken(*options.AccessToken), + } + if err := p.client.setMmds(ctx, meta); err != nil { + fcStopErr := p.Stop(ctx) + + return errors.Join(fmt.Errorf("error setting mmds: %w", err), fcStopErr) + } + telemetry.ReportEvent(ctx, "set fc mmds metadata") + } + err = p.client.startVM(ctx) if err != nil { fcStopErr := p.Stop(ctx) diff --git a/packages/orchestrator/pkg/sandbox/reboot.go b/packages/orchestrator/pkg/sandbox/reboot.go new file mode 100644 index 0000000000..10b542b5f6 --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/reboot.go @@ -0,0 +1,128 @@ +//go:build linux + +package sandbox + +import ( + "context" + "errors" + "fmt" + "time" + + "github.com/google/uuid" + + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/block" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/fc" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/template" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/template/constants" + "github.com/e2b-dev/infra/packages/orchestrator/pkg/units" + "github.com/e2b-dev/infra/packages/shared/pkg/fc/models" + "github.com/e2b-dev/infra/packages/shared/pkg/grpc/orchestrator" + "github.com/e2b-dev/infra/packages/shared/pkg/storage/header" + "github.com/e2b-dev/infra/packages/shared/pkg/utils" +) + +const ( + // minEnvdVersionForKVMClock is the minimum envd version that supports kvm-clock. + minEnvdVersionForKVMClock = "0.2.11" + + // rebootEnvdTimeout bounds the systemd boot + envd start; a cold boot needs a + // longer window than a memory resume (matches the template build's wait). + rebootEnvdTimeout = 60 * time.Second +) + +// RebootSandbox cold-boots a fresh Firecracker VM from the template's rootfs, +// without restoring guest memory. Used to resume filesystem-only snapshots: +// guest RAM, processes, and sockets are lost; only the filesystem survives. +// The sandbox is marked running only after envd is ready, matching +// ResumeSandbox's routing guarantees; endAt is the caller's absolute end time. +// IMPORTANT: You must Close() the sandbox after you are done with it. +func (f *Factory) RebootSandbox( + ctx context.Context, + t template.Template, + config *Config, + runtime RuntimeMetadata, + endAt time.Time, + apiConfigToStore *orchestrator.SandboxConfig, +) (*Sandbox, error) { + ctx, span := tracer.Start(ctx, "reboot sandbox") + defer span.End() + + buildID, err := uuid.Parse(t.Files().BuildID) + if err != nil { + return nil, fmt.Errorf("parse build ID: %w", err) + } + + // The masked empty memfile is used only for sizing NoopMemory — guest RAM + // is FC's own fresh anonymous memory. + pageSize := int64(header.PageSize) + if config.HugePages { + pageSize = int64(header.HugepageSize) + } + memfile, err := block.NewEmpty(units.MBToBytes(config.RamMB), pageSize, buildID) + if err != nil { + return nil, fmt.Errorf("create empty memfile: %w", err) + } + + maskedTemplate := template.NewMaskTemplate(t, template.WithMemfile(memfile)) + + kvmClock, err := utils.IsGTEVersion(config.Envd.Version, minEnvdVersionForKVMClock) + if err != nil { + return nil, fmt.Errorf("compare envd version: %w", err) + } + + // Sync IO engine so no async writes are in flight if the sandbox is paused again. + ioEngine := models.DriveIoEngineSync + + // Always write MMDS metadata for a reboot so the cold-booted envd can + // authenticate /init like a memory resume. An empty token hashes to the + // "no token" value, matching ResumeSandbox's behavior for non-secure sandboxes. + accessToken := "" + if config.Envd.AccessToken != nil { + accessToken = *config.Envd.AccessToken + } + + timeout := time.Until(endAt) + if timeout <= 0 { + return nil, fmt.Errorf("sandbox end time %s is in the past", endAt) + } + + sbx, err := f.CreateSandbox( + ctx, + config, + runtime, + maskedTemplate, + timeout, + // Empty rootfs cache path selects the NBD provider, same as a memory + // resume, so guest TRIM keeps working and a later pause exports the + // overlay diff exactly like a normal resume. + "", + fc.ProcessOptions{ + InitScriptPath: constants.SystemdInitPath, + KvmClock: kvmClock, + IoEngine: &ioEngine, + AccessToken: &accessToken, + }, + apiConfigToStore, + nil, + WithDeferredMarkRunning(), + ) + if err != nil { + return nil, fmt.Errorf("create sandbox from rootfs: %w", err) + } + + // CreateSandbox anchors the lifetime to now; honor the caller's absolute end + // time so queue delay can't extend the TTL. + sbx.SetEndAt(endAt) + + if err := sbx.WaitForEnvd(ctx, StartTypeReboot, rebootEnvdTimeout); err != nil { + closeErr := sbx.Close(context.WithoutCancel(ctx)) + + return nil, errors.Join(fmt.Errorf("wait for envd after reboot: %w", err), closeErr) + } + + f.Sandboxes.MarkRunning(ctx, sbx) + + go sbx.Checks.Start(context.WithoutCancel(ctx)) + + return sbx, nil +} diff --git a/packages/orchestrator/pkg/sandbox/reclaim.go b/packages/orchestrator/pkg/sandbox/reclaim.go index 270be40e8d..dd9dcf09ee 100644 --- a/packages/orchestrator/pkg/sandbox/reclaim.go +++ b/packages/orchestrator/pkg/sandbox/reclaim.go @@ -8,6 +8,8 @@ import ( "strings" "time" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" "go.uber.org/zap" "github.com/e2b-dev/infra/packages/shared/pkg/featureflags" @@ -23,6 +25,24 @@ const reclaimOuterSlack = 500 * time.Millisecond // must stay independent of the reclaim shell deadline. const freezeTimeout = 2 * time.Second +const ( + // syncMinTimeout floors the guest-sync deadline; it covers small-RAM + // sandboxes and the shell round-trip. + syncMinTimeout = 5 * time.Second + + // syncMaxTimeout caps the guest-sync deadline so a stuck sync still fails the + // pause in bounded time rather than hanging it. + syncMaxTimeout = 2 * time.Minute + + // syncFlushFloorBytesPerSec is a pessimistic floor for guest page-cache + // flush throughput to the virtio disk under IO contention. The data a sync + // must flush is bounded by the dirty page cache (≈ guest RAM; pages already + // written back are not re-flushed), so the deadline scales with RAM against + // this floor. Conservative on purpose: too low only over-waits, while too + // high would falsely fail the (mandatory) pre-pause sync. + syncFlushFloorBytesPerSec = 50 * 1024 * 1024 +) + // buildReclaimScript builds the fstrim/sync/drop_caches/compact_memory chain. // Returns ("", 0) when every step is disabled. func (s *Sandbox) buildReclaimScript(cfg featureflags.ReclaimConfig) (string, time.Duration) { @@ -108,6 +128,87 @@ func (s *Sandbox) bestEffortReclaim(ctx context.Context) { } } +// ramScaledSyncTimeout derives the guest-sync deadline from guest RAM. The +// dirty page cache that sync must flush is bounded by RAM, divided by a +// pessimistic flush-throughput floor, then clamped to +// [syncMinTimeout, syncMaxTimeout]. +func ramScaledSyncTimeout(ramMB int64) time.Duration { + ramBytes := ramMB * 1024 * 1024 + d := time.Duration(ramBytes/syncFlushFloorBytesPerSec) * time.Second + + if d < syncMinTimeout { + return syncMinTimeout + } + if d > syncMaxTimeout { + return syncMaxTimeout + } + + return d +} + +// guestSyncTimeout returns the deadline for the pre-pause guest sync. The +// GuestSyncTimeoutMs feature flag pins it (milliseconds) when set to a positive +// value; otherwise it scales with guest RAM via ramScaledSyncTimeout. +func (s *Sandbox) guestSyncTimeout(ctx context.Context) time.Duration { + if ms := s.featureFlags.IntFlag(ctx, featureflags.GuestSyncTimeoutMs, + featureflags.SandboxContext(s.Runtime.SandboxID), + featureflags.TeamContext(s.Runtime.TeamID), + featureflags.TemplateContext(s.Runtime.TemplateID), + ); ms > 0 { + return time.Duration(ms) * time.Millisecond + } + + return ramScaledSyncTimeout(s.Config.RamMB) +} + +// guestSync runs sync in the guest via envd so ext4 flushes dirty pages to the +// virtio disk. Mandatory before a filesystem-only pause: without a memory +// snapshot the guest page cache is lost, so callers must fail the pause on +// error instead of persisting a rootfs missing acknowledged writes. Unlike +// bestEffortReclaim's sync step (LD-flag gated, best-effort), this always runs +// and always reports failure. +func (s *Sandbox) guestSync(ctx context.Context) (e error) { + syncTimeout := s.guestSyncTimeout(ctx) + start := time.Now() + + ctx, span := tracer.Start(ctx, "envd-guest-sync") + defer span.End() + + // Record on every exit so slow and timed-out syncs are captured too. + defer func() { + guestSyncDurationHistogram.Record(ctx, time.Since(start).Milliseconds(), + metric.WithAttributes( + attribute.Bool("success", e == nil), + attribute.Int64("timeout_ms", syncTimeout.Milliseconds()), + ), + ) + }() + + rcCtx, cancel := context.WithTimeout(ctx, syncTimeout+reclaimOuterSlack) + defer cancel() + + stream, err := s.StartEnvdSystemShell(rcCtx, "/bin/sh", []string{"-c", "sync"}, "root", syncTimeout) + if err != nil { + return fmt.Errorf("start guest sync: %w", err) + } + defer stream.Close() + + exitCode := int32(-1) + for stream.Receive() { + if end := stream.Msg().GetEvent().GetEnd(); end != nil { + exitCode = end.GetExitCode() + } + } + if err := stream.Err(); err != nil { + return fmt.Errorf("guest sync stream: %w", err) + } + if exitCode != 0 { + return fmt.Errorf("guest sync exited with code %d", exitCode) + } + + return nil +} + // envdSupportsCgroupFreeze reports whether the sandbox's envd exposes the // native /freeze and /unfreeze endpoints. Bad version strings log and return // false so we never accidentally call an unsupported endpoint. diff --git a/packages/orchestrator/pkg/sandbox/reclaim_test.go b/packages/orchestrator/pkg/sandbox/reclaim_test.go new file mode 100644 index 0000000000..d5325b5f1f --- /dev/null +++ b/packages/orchestrator/pkg/sandbox/reclaim_test.go @@ -0,0 +1,72 @@ +//go:build linux + +package sandbox + +import ( + "testing" + "time" + + "github.com/launchdarkly/go-sdk-common/v3/ldvalue" + "github.com/launchdarkly/go-server-sdk/v7/testhelpers/ldtestdata" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/e2b-dev/infra/packages/shared/pkg/featureflags" +) + +func TestRamScaledSyncTimeout(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + ramMB int64 + want time.Duration + }{ + // 128 MiB / 50 MiB/s ≈ 2.5s -> clamped up to the floor. + {"small RAM clamps to min", 128, syncMinTimeout}, + // 1024 MiB / 50 MiB/s ≈ 20s. + {"scales with RAM", 1024, 20 * time.Second}, + // 128 GiB / 50 MiB/s ≈ 2621s -> clamped down to the cap. + {"large RAM clamps to max", 128 * 1024, syncMaxTimeout}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + assert.Equal(t, tt.want, ramScaledSyncTimeout(tt.ramMB)) + }) + } +} + +func newSandboxWithFF(t *testing.T, ramMB int64, td *ldtestdata.TestDataSource) *Sandbox { + t.Helper() + + ff, err := featureflags.NewClientWithDatasource(td) + require.NoError(t, err) + t.Cleanup(func() { _ = ff.Close(t.Context()) }) + + s := &Sandbox{Metadata: &Metadata{Config: &Config{RamMB: ramMB}}} + s.featureFlags = ff + + return s +} + +func TestGuestSyncTimeout_FlagOverride(t *testing.T) { + t.Parallel() + + t.Run("positive flag pins the timeout regardless of RAM", func(t *testing.T) { + t.Parallel() + td := ldtestdata.DataSource() + td.Update(td.Flag(featureflags.GuestSyncTimeoutMs.Key()).ValueForAll(ldvalue.Int(30000))) + s := newSandboxWithFF(t, 1024, td) // RAM-derived would be 20s + + assert.Equal(t, 30*time.Second, s.guestSyncTimeout(t.Context())) + }) + + t.Run("unset flag falls back to RAM-derived", func(t *testing.T) { + t.Parallel() + s := newSandboxWithFF(t, 1024, ldtestdata.DataSource()) + + assert.Equal(t, ramScaledSyncTimeout(1024), s.guestSyncTimeout(t.Context())) + }) +} diff --git a/packages/orchestrator/pkg/sandbox/sandbox.go b/packages/orchestrator/pkg/sandbox/sandbox.go index 8101f91400..e9fe9d5419 100644 --- a/packages/orchestrator/pkg/sandbox/sandbox.go +++ b/packages/orchestrator/pkg/sandbox/sandbox.go @@ -48,6 +48,7 @@ var ( meter = otel.Meter("github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox") envdInitCalls = utils.Must(telemetry.GetCounter(meter, telemetry.EnvdInitCalls)) waitForEnvdDurationHistogram = utils.Must(telemetry.GetHistogram(meter, telemetry.WaitForEnvdDurationHistogramName)) + guestSyncDurationHistogram = utils.Must(telemetry.GetHistogram(meter, telemetry.GuestSyncDurationHistogramName)) uffdStartupPagesHistogram = utils.Must(telemetry.GetHistogram(meter, telemetry.UffdStartupPagesHistogramName)) uffdStartupSourcePagesHistogram = utils.Must(telemetry.GetHistogram(meter, telemetry.UffdStartupSourcePagesHistogramName)) @@ -59,6 +60,7 @@ var ( const ( StartTypeCreate = "create" // cold boot (template build) StartTypeResume = "resume" // resume from a snapshot (the common runtime path) + StartTypeReboot = "reboot" // cold boot from a snapshot rootfs (filesystem-only resume) ) var SandboxHttpTransport = otelhttp.NewTransport( @@ -346,6 +348,20 @@ func (f *Factory) EgressProxy() network.EgressProxy { // on the host side. type PreBootFn func(ctx context.Context, rootfsPath string) error +type createOptions struct { + deferMarkRunning bool +} + +type CreateOption func(*createOptions) + +// WithDeferredMarkRunning skips marking the sandbox running inside CreateSandbox +// so the caller can mark it only after envd is ready, matching ResumeSandbox. +// Used by the reboot path, where the guest is cold-booting and must not be +// routable until envd answers. +func WithDeferredMarkRunning() CreateOption { + return func(o *createOptions) { o.deferMarkRunning = true } +} + // CreateSandbox creates the sandbox. // IMPORTANT: You must Close() the sandbox after you are done with it. func (f *Factory) CreateSandbox( @@ -358,11 +374,17 @@ func (f *Factory) CreateSandbox( processOptions fc.ProcessOptions, apiConfigToStore *orchestrator.SandboxConfig, preBootFn PreBootFn, + opts ...CreateOption, ) (s *Sandbox, e error) { ctx, span := tracer.Start(ctx, "create sandbox") defer span.End() defer handleSpanError(span, &e) + var createOpts createOptions + for _, opt := range opts { + opt(&createOpts) + } + execCtx, execSpan := startExecutionSpan(ctx) exit := utils.NewErrorOnce() @@ -586,7 +608,9 @@ func (f *Factory) CreateSandbox( exit.SetError(errors.Join(err, fcErr)) }() - f.Sandboxes.MarkRunning(ctx, sbx) + if !createOpts.deferMarkRunning { + f.Sandboxes.MarkRunning(ctx, sbx) + } return sbx, nil } @@ -1087,6 +1111,20 @@ func (s *Sandbox) Shutdown(ctx context.Context) error { return nil } +type pauseOptions struct { + filesystemSnapshot bool +} + +type PauseOption func(*pauseOptions) + +// WithFilesystemSnapshot makes the pause produce a filesystem-only snapshot: +// guest memory is not snapshotted, only the filesystem (rootfs) is persisted. +// Resuming such a snapshot reboots the guest instead of restoring memory state. +// The default (no option) is a full memory snapshot. +func WithFilesystemSnapshot() PauseOption { + return func(o *pauseOptions) { o.filesystemSnapshot = true } +} + // Pause creates a snapshot of the sandbox. // // Currently the memory snapshotting works like this: @@ -1098,12 +1136,25 @@ func (s *Sandbox) Shutdown(ctx context.Context) error { // that returns info about resident memory pages and about empty memory pages. // 5. Base on the info from the custom FC endpoint or from Uffd we copy the pages directly from the FC process to a local cache. // 6. We then can either close the sandbox or resume it. +// +// With WithFilesystemSnapshot(), steps 3-5 are skipped: a guest sync flushes +// the page cache to disk before pause, CreateSnapshot is still called for its +// disk drain+flush side effect (the snapfile is not uploaded), and the memfile +// diff is empty (NoDiff). func (s *Sandbox) Pause( ctx context.Context, m metadata.Template, useCase SnapshotUseCase, + opts ...PauseOption, ) (st *Snapshot, e error) { - ctx, span := tracer.Start(ctx, "sandbox-snapshot") + var pauseOpts pauseOptions + for _, opt := range opts { + opt(&pauseOpts) + } + + ctx, span := tracer.Start(ctx, "sandbox-snapshot", trace.WithAttributes( + attribute.Bool("fs-only-snapshot", pauseOpts.filesystemSnapshot), + )) defer span.End() cleanup := NewCleanup() @@ -1143,6 +1194,19 @@ func (s *Sandbox) Pause( return nil }) + if pauseOpts.filesystemSnapshot { + // FC never flushes the guest page cache and no memory snapshot will + // preserve it, so a failed sync would persist a rootfs missing + // acknowledged writes. Mandatory, unlike the best-effort reclaim above. + // The unfreeze cleanup is already registered, so failing here can't + // leave the live VM frozen. + if err := s.guestSync(ctx); err != nil { + return nil, fmt.Errorf("guest sync before filesystem-only pause: %w", err) + } + // Memory prefetch refers to the memfile, which is not persisted. + m.Prefetch = nil + } + // Drain free-page-hinting before pause so the snapshot doesn't capture // pages the guest already considers free. Timeout per use case; 0 disables. if t := featureflags.GetFreePageHintingTimeout(ctx, s.featureFlags, string(useCase), sandboxLDContext(s.Runtime, s.Config)); t > 0 { @@ -1165,63 +1229,38 @@ func (s *Sandbox) Pause( snapfile := template.NewLocalFileLink(cachePaths.CacheSnapfile()) cleanup.AddNoContext(ctx, snapfile.Close) + // CreateSnapshot also drains and flushes the virtio disk in our custom FC, so + // it runs even for a filesystem-only pause (which needs the disk flush); the + // resulting snapfile is just not uploaded in that case. err = s.process.CreateSnapshot(ctx, snapfile.Path()) if err != nil { return nil, fmt.Errorf("error creating snapshot: %w", err) } // Gather data for postprocessing - originalMemfile, err := s.Template.Memfile(ctx) - if err != nil { - return nil, fmt.Errorf("failed to get original memfile: %w", err) - } - originalRootfs, err := s.Template.Rootfs() if err != nil { return nil, fmt.Errorf("failed to get original rootfs: %w", err) } - memfileDiffMetadata, err := s.Resources.memory.DiffMetadata(ctx, s.process) - if err != nil { - return nil, fmt.Errorf("failed to get memfile metadata: %w", err) - } - recordSnapshotDiff(ctx, "memfile", memfileDiffMetadata, originalMemfile.Header()) - // Start POSTPROCESSING - var dedupBase block.ReadonlyDevice - var dedupBestEffort, dedupDirectIO bool - var dedupBudget block.DedupBudget - dedupCfg := s.featureFlags.JSONFlag(ctx, featureflags.MemfileDiffDedupFlag, sandboxLDContext(s.Runtime, s.Config)).AsValueMap() - if dedupCfg.Get("enabled").BoolValue() { - dedupBase = originalMemfile - dedupBestEffort = dedupCfg.Get("bestEffort").BoolValue() - dedupDirectIO = dedupCfg.Get("directIO").BoolValue() - dedupBudget = block.DedupBudget{ - MaxFetchWindowsPerBlock: dedupCfg.Get("maxFetchWindowsPerBlock").IntValue(), - MaxPromotedParentPagesPerBlock: dedupCfg.Get("maxPromotedParentPagesPerBlock").IntValue(), - MaxPagesPerPromotedFrame: dedupCfg.Get("maxPagesPerPromotedFrame").IntValue(), - BlockFaultPct: dedupCfg.Get("blockFaultPct").IntValue(), - FetchRunWindowPages: dedupCfg.Get("fetchRunWindowPages").IntValue(), + // + // For a filesystem-only pause the memory snapshot is skipped entirely: the + // memfile diff stays NoDiff with no header, and the memfile-derived fields + // stay zero so the snapshot and scheduling metadata carry rootfs only. + mem := MemorySnapshot{ + Diff: build.Diff(&build.NoDiff{}), + DiffHeader: NewResolvedDiffHeader(nil), + } + if !pauseOpts.filesystemSnapshot { + mem, err = s.processMemorySnapshot(ctx, buildID) + if err != nil { + return nil, err } } - memfileDiff, memfileDiffHeader, err := pauseProcessMemory( - ctx, - buildID, - originalMemfile.Header(), - memfileDiffMetadata, - s.config.DefaultCacheDir, - s.process, - s.memory.Memfd(ctx), - s.featureFlags.BoolFlag(ctx, featureflags.MemfdBackgroundCopyFlag, sandboxLDContext(s.Runtime, s.Config)), - dedupBase, - dedupBestEffort, - dedupDirectIO, - dedupBudget, - ) - if err != nil { - return nil, fmt.Errorf("error while post processing: %w", err) - } - cleanup.AddNoContext(ctx, memfileDiff.Close) + // NoDiff.Close is a no-op, so registering it for the filesystem-only case is + // harmless and keeps the cleanup ordering identical to the memory path. + cleanup.AddNoContext(ctx, mem.Diff.Close) rootfsDiff, rootfsHeader, err := pauseProcessRootfs( ctx, @@ -1247,8 +1286,8 @@ func (s *Sandbox) Pause( // base-identical ones, so it over-estimates. The rootfs copy is synchronous // today, so its new header carries the exact rootfs chain and bytes; if it // ever becomes async, switch it to the parent plus a dirty proxy like memfile. - newMemfileBytes := memfileDiffMetadata.Dirty.GetCardinality() * uint64(memfileDiffMetadata.BlockSize) - schedulingMetadata := scheduling.FromHeaders(buildID, originalMemfile.Header(), rootfsHeader, newMemfileBytes) + // mem.header is nil for a filesystem-only pause → rootfs-only metadata. + schedulingMetadata := scheduling.FromHeaders(buildID, mem.header, rootfsHeader, mem.newBytes) metadataFileLink := template.NewLocalFileLink(cachePaths.CacheMetadata()) cleanup.AddNoContext(ctx, metadataFileLink.Close) @@ -1261,12 +1300,11 @@ func (s *Sandbox) Pause( return &Snapshot{ Snapfile: snapfile, Metafile: metadataFileLink, - MemfileDiff: memfileDiff, - MemfileDiffHeader: memfileDiffHeader, + MemorySnapshot: mem, RootfsDiff: rootfsDiff, RootfsDiffHeader: rootfsDiffHeader, SchedulingMetadata: schedulingMetadata, - MemfileBlockSize: originalMemfile.Header().Metadata.BlockSize, + FilesystemSnapshot: pauseOpts.filesystemSnapshot, RootfsBlockSize: originalRootfs.Header().Metadata.BlockSize, BuildID: buildID, @@ -1275,6 +1313,86 @@ func (s *Sandbox) Pause( }, nil } +// MemorySnapshot bundles the products of memory postprocessing during a Pause: +// the memfile diff, its (async-resolved) header, and the block size. It is +// embedded in Snapshot. For a filesystem-only pause it is zero-valued except for +// an empty NoDiff and a resolved-nil header (see Snapshot.FilesystemSnapshot). +type MemorySnapshot struct { + Diff build.Diff + DiffHeader *DiffHeader + // BlockSize is captured synchronously at Pause time because NewUpload's + // compression validation needs it before the async dedup header resolves; + // the dedup memfile path produces a page-granular Diff.BlockSize() that + // doesn't match the chunker-read granularity on restore. + BlockSize uint64 + + // header (base memfile) and newBytes (pre-dedup dirty-byte upper bound) are + // scheduling inputs consumed only at Pause time, so they stay unexported. + header *header.Header + newBytes uint64 +} + +// processMemorySnapshot copies the dirty guest memory pages into a local diff +// and builds its header — steps 3-5 of Pause. Only called for a full memory +// snapshot; a filesystem-only pause skips it. The returned diff's Close must be +// registered for cleanup by the caller. +func (s *Sandbox) processMemorySnapshot(ctx context.Context, buildID uuid.UUID) (MemorySnapshot, error) { + originalMemfile, err := s.Template.Memfile(ctx) + if err != nil { + return MemorySnapshot{}, fmt.Errorf("failed to get original memfile: %w", err) + } + memfileHeader := originalMemfile.Header() + + memfileDiffMetadata, err := s.Resources.memory.DiffMetadata(ctx, s.process) + if err != nil { + return MemorySnapshot{}, fmt.Errorf("failed to get memfile metadata: %w", err) + } + recordSnapshotDiff(ctx, "memfile", memfileDiffMetadata, memfileHeader) + + var dedupBase block.ReadonlyDevice + var dedupBestEffort, dedupDirectIO bool + var dedupBudget block.DedupBudget + dedupCfg := s.featureFlags.JSONFlag(ctx, featureflags.MemfileDiffDedupFlag, sandboxLDContext(s.Runtime, s.Config)).AsValueMap() + if dedupCfg.Get("enabled").BoolValue() { + dedupBase = originalMemfile + dedupBestEffort = dedupCfg.Get("bestEffort").BoolValue() + dedupDirectIO = dedupCfg.Get("directIO").BoolValue() + dedupBudget = block.DedupBudget{ + MaxFetchWindowsPerBlock: dedupCfg.Get("maxFetchWindowsPerBlock").IntValue(), + MaxPromotedParentPagesPerBlock: dedupCfg.Get("maxPromotedParentPagesPerBlock").IntValue(), + MaxPagesPerPromotedFrame: dedupCfg.Get("maxPagesPerPromotedFrame").IntValue(), + BlockFaultPct: dedupCfg.Get("blockFaultPct").IntValue(), + FetchRunWindowPages: dedupCfg.Get("fetchRunWindowPages").IntValue(), + } + } + + memfileDiff, memfileDiffHeader, err := pauseProcessMemory( + ctx, + buildID, + memfileHeader, + memfileDiffMetadata, + s.config.DefaultCacheDir, + s.process, + s.memory.Memfd(ctx), + s.featureFlags.BoolFlag(ctx, featureflags.MemfdBackgroundCopyFlag, sandboxLDContext(s.Runtime, s.Config)), + dedupBase, + dedupBestEffort, + dedupDirectIO, + dedupBudget, + ) + if err != nil { + return MemorySnapshot{}, fmt.Errorf("error while post processing: %w", err) + } + + return MemorySnapshot{ + Diff: memfileDiff, + DiffHeader: memfileDiffHeader, + BlockSize: memfileHeader.Metadata.BlockSize, + header: memfileHeader, + newBytes: memfileDiffMetadata.Dirty.GetCardinality() * uint64(memfileDiffMetadata.BlockSize), + }, nil +} + // FlushAndReadBalloonMetrics triggers an FC metrics flush and returns the // updated cumulative virtio-balloon counters. Used by the FPH bench. func (s *Sandbox) FlushAndReadBalloonMetrics(ctx context.Context) (fc.BalloonMetricsSnapshot, error) { diff --git a/packages/orchestrator/pkg/sandbox/snapshot.go b/packages/orchestrator/pkg/sandbox/snapshot.go index 5c4ef05729..9b90397960 100644 --- a/packages/orchestrator/pkg/sandbox/snapshot.go +++ b/packages/orchestrator/pkg/sandbox/snapshot.go @@ -27,8 +27,10 @@ func NewResolvedDiffHeader(h *header.Header) *DiffHeader { } type Snapshot struct { - MemfileDiff build.Diff - MemfileDiffHeader *DiffHeader + // MemorySnapshot bundles the memfile diff, its header, and block size. It is + // empty (NoDiff) for filesystem-only snapshots (see FilesystemSnapshot). + MemorySnapshot MemorySnapshot + RootfsDiff build.Diff RootfsDiffHeader *DiffHeader Snapfile template.File @@ -36,13 +38,17 @@ type Snapshot struct { BuildID uuid.UUID SchedulingMetadata *orchestrator.SchedulingMetadata - // Template block sizes captured sync at Pause time. They equal - // MemfileDiffHeader.Metadata.BlockSize once that header resolves, but - // are needed sync by NewUpload's compression validation — the dedup - // memfile path produces a page-granular Diff.BlockSize() that doesn't - // match the chunker-read granularity on restore. - MemfileBlockSize uint64 - RootfsBlockSize uint64 + // FilesystemSnapshot is true for filesystem-only snapshots: the memfile diff + // is empty (NoDiff) and the memfile, memfile header, and snapfile are not + // uploaded. It records the decision made at pause time, which can't be + // inferred from the diff shape — a memory snapshot with zero dirty pages also + // produces a NoDiff memfile but still needs its snapfile uploaded. + FilesystemSnapshot bool + + // RootfsBlockSize is captured sync at Pause time — needed sync by NewUpload's + // compression validation. (The memfile block size lives in + // MemorySnapshot.BlockSize.) + RootfsBlockSize uint64 cleanup *Cleanup } diff --git a/packages/orchestrator/pkg/sandbox/template/storage_template.go b/packages/orchestrator/pkg/sandbox/template/storage_template.go index 955578cc11..7601b96198 100644 --- a/packages/orchestrator/pkg/sandbox/template/storage_template.go +++ b/packages/orchestrator/pkg/sandbox/template/storage_template.go @@ -276,18 +276,28 @@ func (t *storageTemplate) Files() storage.CachePaths { // rather than the header holders, which stay unset for templates loaded from // storage (the headers are resolved internally by NewStorage during Fetch). func (t *storageTemplate) SchedulingMetadata(ctx context.Context) *orchestrator.SchedulingMetadata { - memfile, memfileErr := t.memfile.WaitWithContext(ctx) + // The rootfs is always present; its header carries the build ID and is the + // minimum needed for scheduling metadata. rootfs, rootfsErr := t.rootfs.WaitWithContext(ctx) - if memfileErr != nil || rootfsErr != nil { + if rootfsErr != nil { return nil } - mh := memfile.Header() - if mh == nil || mh.Metadata == nil { + rh := rootfs.Header() + if rh == nil || rh.Metadata == nil { return nil } - return scheduling.FromHeaders(mh.Metadata.BuildId, mh, rootfs.Header(), 0) + // Filesystem-only snapshots have no memfile object, so memfile.WaitWithContext + // errors on reload. Tolerate that and report rootfs-only scheduling metadata + // (FromHeaders treats a nil memfile header as rootfs-only) instead of + // dropping the rootfs affinity data too. + var mh *header.Header + if memfile, memfileErr := t.memfile.WaitWithContext(ctx); memfileErr == nil { + mh = memfile.Header() + } + + return scheduling.FromHeaders(rh.Metadata.BuildId, mh, rh, 0) } func (t *storageTemplate) Memfile(ctx context.Context) (block.ReadonlyDevice, error) { diff --git a/packages/orchestrator/pkg/scheduling/metadata.go b/packages/orchestrator/pkg/scheduling/metadata.go index de6111ef4f..94ea7cad25 100644 --- a/packages/orchestrator/pkg/scheduling/metadata.go +++ b/packages/orchestrator/pkg/scheduling/metadata.go @@ -20,28 +20,35 @@ const chainLimit = 128 // derives the rest from the resolved parent headers and passes newMemfileBytes // (a pre-dedup upper bound) for the new layer. Lists are sorted by build ID — // order is not significant for affinity matching. +// +// A nil memfileHeader (filesystem-only snapshot) yields rootfs-only metadata +// with empty memfile fields; the rootfs header is always required. func FromHeaders(buildID uuid.UUID, memfileHeader, rootfsHeader *header.Header, newMemfileBytes uint64) *orchestrator.SchedulingMetadata { - if memfileHeader == nil || memfileHeader.Metadata == nil || rootfsHeader == nil || rootfsHeader.Metadata == nil { + if rootfsHeader == nil || rootfsHeader.Metadata == nil { return nil } - memfileBase := memfileHeader.Metadata.BaseBuildId rootfsBase := rootfsHeader.Metadata.BaseBuildId - - memIDs, memBytes, memDropped := artifactBuilds(memfileHeader, memfileBase, buildID, newMemfileBytes) rootIDs, rootBytes, rootDropped := artifactBuilds(rootfsHeader, rootfsBase, buildID, 0) - return &orchestrator.SchedulingMetadata{ - MemfileBaseBuildId: memfileBase.String(), - RootfsBaseBuildId: rootfsBase.String(), - BuildId: buildID.String(), - MemfileBuildIds: memIDs, - RootfsBuildIds: rootIDs, - MemfileBuildBytes: memBytes, - RootfsBuildBytes: rootBytes, - MemfileDroppedBuilds: uint32(memDropped), - RootfsDroppedBuilds: uint32(rootDropped), + md := &orchestrator.SchedulingMetadata{ + RootfsBaseBuildId: rootfsBase.String(), + BuildId: buildID.String(), + RootfsBuildIds: rootIDs, + RootfsBuildBytes: rootBytes, + RootfsDroppedBuilds: uint32(rootDropped), + } + + if memfileHeader != nil && memfileHeader.Metadata != nil { + memfileBase := memfileHeader.Metadata.BaseBuildId + memIDs, memBytes, memDropped := artifactBuilds(memfileHeader, memfileBase, buildID, newMemfileBytes) + md.MemfileBaseBuildId = memfileBase.String() + md.MemfileBuildIds = memIDs + md.MemfileBuildBytes = memBytes + md.MemfileDroppedBuilds = uint32(memDropped) } + + return md } // artifactBuilds returns the build IDs referenced by the header and their diff --git a/packages/orchestrator/pkg/scheduling/metadata_test.go b/packages/orchestrator/pkg/scheduling/metadata_test.go new file mode 100644 index 0000000000..bb0e3b2fe6 --- /dev/null +++ b/packages/orchestrator/pkg/scheduling/metadata_test.go @@ -0,0 +1,80 @@ +package scheduling_test + +import ( + "testing" + + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/e2b-dev/infra/packages/orchestrator/pkg/scheduling" + "github.com/e2b-dev/infra/packages/shared/pkg/storage/header" +) + +func hdr(buildID, baseID uuid.UUID) *header.Header { + return &header.Header{ + Metadata: &header.Metadata{ + BuildId: buildID, + BaseBuildId: baseID, + BlockSize: 4096, + }, + } +} + +func TestFromHeaders(t *testing.T) { + t.Parallel() + + buildID := uuid.MustParse("11111111-1111-1111-1111-111111111111") + memBase := uuid.MustParse("22222222-2222-2222-2222-222222222222") + rootBase := uuid.MustParse("33333333-3333-3333-3333-333333333333") + + memHeader := hdr(buildID, memBase) + rootHeader := hdr(buildID, rootBase) + + t.Run("filesystem-only: nil memfile header yields rootfs-only metadata", func(t *testing.T) { + t.Parallel() + md := scheduling.FromHeaders(buildID, nil, rootHeader, 0) + + require.NotNil(t, md) + assert.Equal(t, buildID.String(), md.GetBuildId()) + assert.Equal(t, rootBase.String(), md.GetRootfsBaseBuildId()) + assert.NotEmpty(t, md.GetRootfsBuildIds()) + + // Memfile fields must be empty for a filesystem-only snapshot. + assert.Empty(t, md.GetMemfileBaseBuildId()) + assert.Empty(t, md.GetMemfileBuildIds()) + assert.Empty(t, md.GetMemfileBuildBytes()) + assert.Zero(t, md.GetMemfileDroppedBuilds()) + }) + + t.Run("memfile header with nil Metadata is treated as absent", func(t *testing.T) { + t.Parallel() + md := scheduling.FromHeaders(buildID, &header.Header{}, rootHeader, 0) + + require.NotNil(t, md) + assert.NotEmpty(t, md.GetRootfsBuildIds()) + assert.Empty(t, md.GetMemfileBaseBuildId()) + assert.Empty(t, md.GetMemfileBuildIds()) + }) + + t.Run("nil rootfs header yields nil", func(t *testing.T) { + t.Parallel() + assert.Nil(t, scheduling.FromHeaders(buildID, memHeader, nil, 0)) + }) + + t.Run("rootfs header with nil Metadata yields nil", func(t *testing.T) { + t.Parallel() + assert.Nil(t, scheduling.FromHeaders(buildID, memHeader, &header.Header{}, 0)) + }) + + t.Run("both headers present populate memfile and rootfs", func(t *testing.T) { + t.Parallel() + md := scheduling.FromHeaders(buildID, memHeader, rootHeader, 1024) + + require.NotNil(t, md) + assert.Equal(t, memBase.String(), md.GetMemfileBaseBuildId()) + assert.Equal(t, rootBase.String(), md.GetRootfsBaseBuildId()) + assert.NotEmpty(t, md.GetMemfileBuildIds()) + assert.NotEmpty(t, md.GetRootfsBuildIds()) + }) +} diff --git a/packages/orchestrator/pkg/server/sandboxes.go b/packages/orchestrator/pkg/server/sandboxes.go index 08d4b8df44..d6b35fd3c7 100644 --- a/packages/orchestrator/pkg/server/sandboxes.go +++ b/packages/orchestrator/pkg/server/sandboxes.go @@ -878,11 +878,11 @@ func (s *Server) snapshotAndCacheSandbox( err = s.templateCache.AddSnapshot( ctx, meta.Template.BuildID, - snapshot.MemfileDiffHeader, + snapshot.MemorySnapshot.DiffHeader, snapshot.RootfsDiffHeader, snapshot.Snapfile, snapshot.Metafile, - snapshot.MemfileDiff, + snapshot.MemorySnapshot.Diff, snapshot.RootfsDiff, ) if err != nil { diff --git a/packages/orchestrator/pkg/template/build/layer/layer_executor.go b/packages/orchestrator/pkg/template/build/layer/layer_executor.go index ad1f0ad23a..443e82574c 100644 --- a/packages/orchestrator/pkg/template/build/layer/layer_executor.go +++ b/packages/orchestrator/pkg/template/build/layer/layer_executor.go @@ -271,11 +271,11 @@ func (lb *LayerExecutor) PauseAndUpload( err = lb.templateCache.AddSnapshot( context.WithoutCancel(ctx), meta.Template.BuildID, - snapshot.MemfileDiffHeader, + snapshot.MemorySnapshot.DiffHeader, snapshot.RootfsDiffHeader, snapshot.Snapfile, snapshot.Metafile, - snapshot.MemfileDiff, + snapshot.MemorySnapshot.Diff, snapshot.RootfsDiff, ) if err != nil { diff --git a/packages/shared/pkg/featureflags/flags.go b/packages/shared/pkg/featureflags/flags.go index a181bde3a7..df718d09fc 100644 --- a/packages/shared/pkg/featureflags/flags.go +++ b/packages/shared/pkg/featureflags/flags.go @@ -220,6 +220,10 @@ var ( BestOfKAlpha = NewIntFlag("best-of-k-alpha", 50) // Default Alpha=0.5 (stored as percentage for int flag, current usage weight) EnvdInitTimeoutMilliseconds = NewIntFlag("envd-init-request-timeout-milliseconds", 50) // Timeout for envd init request in milliseconds HostStatsSamplingInterval = NewIntFlag("host-stats-sampling-interval", 5000) // Host stats sampling interval in milliseconds (default 5s) + // GuestSyncTimeoutMs overrides the mandatory pre-pause guest-sync deadline + // for filesystem-only snapshots, in milliseconds. 0 (default) derives the + // timeout from guest RAM; a positive value pins it. + GuestSyncTimeoutMs = NewIntFlag("guest-sync-timeout-milliseconds", 0) MaxCacheWriterConcurrencyFlag = NewIntFlag("max-cache-writer-concurrency", 10) // BuildCacheMaxUsagePercentage the maximum percentage of the cache disk storage diff --git a/packages/shared/pkg/telemetry/meters.go b/packages/shared/pkg/telemetry/meters.go index 477a9a879c..f3ef173638 100644 --- a/packages/shared/pkg/telemetry/meters.go +++ b/packages/shared/pkg/telemetry/meters.go @@ -79,6 +79,7 @@ const ( // Sandbox timing histograms OrchestratorSandboxCreateDurationName HistogramType = "orchestrator.sandbox.create.duration" WaitForEnvdDurationHistogramName HistogramType = "orchestrator.sandbox.envd.init.duration" + GuestSyncDurationHistogramName HistogramType = "orchestrator.sandbox.guest_sync.duration" // Sandbox startup working-set histograms: demand-fault pages/bytes a guest // needed to reach a successful envd init, recorded once per start. Sampled @@ -396,6 +397,7 @@ var histogramDesc = map[HistogramType]string{ BuildRootfsSizeHistogramName: "Size of the built template rootfs in bytes", OrchestratorSandboxCreateDurationName: "Time taken to create a sandbox", WaitForEnvdDurationHistogramName: "Time taken for Envd to initialize successfully", + GuestSyncDurationHistogramName: "Time taken for the mandatory pre-pause guest sync (filesystem-only pause)", UffdStartupPagesHistogramName: "Demand-fault pages a guest needed to reach a successful envd init, per start", UffdStartupSourcePagesHistogramName: "Subset of startup demand-fault pages pulled from the source (e.g. GCS), per start", @@ -441,6 +443,7 @@ var histogramUnits = map[HistogramType]string{ BuildRootfsSizeHistogramName: "{By}", OrchestratorSandboxCreateDurationName: "ms", WaitForEnvdDurationHistogramName: "ms", + GuestSyncDurationHistogramName: "ms", UffdStartupPagesHistogramName: "{page}", UffdStartupSourcePagesHistogramName: "{page}", UffdStartupBytesHistogramName: "{By}",