diff --git a/packages/orchestrator/pkg/factories/run.go b/packages/orchestrator/pkg/factories/run.go
index 513df32b44..9237f4ddc2 100644
--- a/packages/orchestrator/pkg/factories/run.go
+++ b/packages/orchestrator/pkg/factories/run.go
@@ -575,8 +575,8 @@ func run(config cfg.Config, opts Options) (success bool) {
 	if err != nil {
 		logger.L().Fatal(ctx, "failed to create orchestrator server", zap.Error(err))
 	}
-	closers = append(closers, closer{"orchestrator server", func(context.Context) error {
-		return orchestratorService.Close()
+	closers = append(closers, closer{"orchestrator server", func(ctx context.Context) error {
+		return orchestratorService.Close(ctx)
 	}})
 
 	// template manager sandbox logger
diff --git a/packages/orchestrator/pkg/sandbox/uploads.go b/packages/orchestrator/pkg/sandbox/uploads.go
index f04125e236..3e5cdef767 100644
--- a/packages/orchestrator/pkg/sandbox/uploads.go
+++ b/packages/orchestrator/pkg/sandbox/uploads.go
@@ -32,14 +32,19 @@ var (
 )
 
 const (
-	futureTTL = 1 * time.Hour
+	// futureTTL must outlive a parent upload's full retry window so a child's
+	// in-memory Wait still finds the parent's future. Keep >= the upload retry
+	// budget (server.uploadTotalBudget, 2h).
+	futureTTL = 3 * time.Hour
 
 	// refreshHeaderBudget bounds how long an upload Wait polls remote storage
 	// for a parent's V4 header. Crosses orchestrators: A may still be uploading
-	// on a remote orch when B's runV4 calls Wait(A) here. Matches the
-	// per-upload bound in server.uploadTimeout — anything longer means the
-	// parent's upload is itself stuck and would have failed on its own.
-	refreshHeaderBudget = 20 * time.Minute
+	// on a remote orch when B's runV4 calls Wait(A) here. It must be >= the
+	// parent's full retry window (server.uploadTotalBudget, 2h); otherwise the
+	// poll's budget expiry returns a non-retryable "object does not exist" and
+	// the child gives up while the parent is still retrying. The per-attempt
+	// context (server.uploadTimeout) bounds the actual poll duration.
+	refreshHeaderBudget = 2 * time.Hour
 
 	// uploadDoneChannelPrefix is the Redis pub/sub channel prefix for per-build
 	// upload-finished signals. Empty payload = success; non-empty = upload error.
diff --git a/packages/orchestrator/pkg/server/main.go b/packages/orchestrator/pkg/server/main.go
index 37c3a0f513..b93db7f7cd 100644
--- a/packages/orchestrator/pkg/server/main.go
+++ b/packages/orchestrator/pkg/server/main.go
@@ -6,6 +6,7 @@ import (
 	"context"
 	"fmt"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/jellydator/ttlcache/v3"
@@ -38,6 +39,10 @@ const uploadedBuildsTTL = 1 * time.Hour
 // MaxStartingInstancesPerNode feature flag and resize the semaphore.
 const startingSandboxesLimitRefreshInterval = 30 * time.Second
 
+// uploadDrainLogInterval is how often Close logs progress while waiting for
+// in-flight snapshot uploads to finish during shutdown.
+const uploadDrainLogInterval = 10 * time.Second
+
 type Server struct {
 	orchestrator.UnimplementedSandboxServiceServer
 	orchestrator.UnimplementedChunkServiceServer
@@ -58,6 +63,13 @@ type Server struct {
 	uploads               *sandbox.Uploads
 	sandboxCreateDuration metric.Int64Histogram
 	sandboxKilledCounter  metric.Int64Counter
+	uploadFailedCounter   metric.Int64Counter
+
+	// uploadsWG tracks in-flight async snapshot uploads so a graceful shutdown
+	// can wait for them to finish instead of dropping them. uploadsInFlight is
+	// the live count, used to log drain progress during shutdown.
+	uploadsWG       sync.WaitGroup
+	uploadsInFlight atomic.Int64
 
 	done      chan struct{}
 	closeOnce sync.Once
@@ -123,6 +135,12 @@ func New(ctx context.Context, cfg ServiceConfig) (*Server, error) {
 	}
 	server.sandboxKilledCounter = sandboxKilledCounter
 
+	uploadFailedCounter, err := telemetry.GetCounter(meter, telemetry.OrchestratorSnapshotUploadFailedCounterName)
+	if err != nil {
+		return nil, fmt.Errorf("failed to register snapshot upload failed counter: %w", err)
+	}
+	server.uploadFailedCounter = uploadFailedCounter
+
 	_, err = telemetry.GetObservableUpDownCounter(meter, telemetry.OrchestratorSandboxCountMeterName, func(_ context.Context, observer metric.Int64Observer) error {
 		observer.Observe(int64(server.sandboxFactory.Sandboxes.Count()))
 
@@ -156,16 +174,61 @@ func New(ctx context.Context, cfg ServiceConfig) (*Server, error) {
 	return server, nil
 }
 
-func (s *Server) Close() error {
+func (s *Server) Close(ctx context.Context) error {
 	s.closeOnce.Do(func() {
 		close(s.done)
 	})
 
+	// Wait for in-flight snapshot uploads so a graceful shutdown doesn't drop a
+	// snapshot that is still uploading. uploadsWG.Wait is bridged to a channel
+	// so drainUploads can also watch ctx, which is cancelled on a forced stop.
+	uploadsDone := make(chan struct{})
+	go func() {
+		s.uploadsWG.Wait() // on a forced stop this stays blocked until the uploads end
+		close(uploadsDone)
+	}()
+
+	s.drainUploads(ctx, uploadsDone)
+
 	s.uploadedBuilds.Stop()
 
 	return nil
 }
 
+// drainUploads blocks until uploadsDone is closed or ctx is done, whichever
+// comes first, logging the live in-flight count every uploadDrainLogInterval.
+func (s *Server) drainUploads(ctx context.Context, uploadsDone <-chan struct{}) {
+	// Nothing was uploading when shutdown started: skip the wait and its logs.
+	pending := s.uploadsInFlight.Load()
+	if pending == 0 {
+		return
+	}
+
+	logger.L().Info(ctx, "waiting for in-flight snapshot uploads to finish", zap.Int64("uploads", pending))
+
+	progress := time.NewTicker(uploadDrainLogInterval)
+	defer progress.Stop()
+
+	// remaining re-reads the counter so every message reports a fresh value.
+	remaining := func() zap.Field { return zap.Int64("uploads", s.uploadsInFlight.Load()) }
+
+	for {
+		select {
+		case <-progress.C:
+			logger.L().Info(ctx, "still waiting for in-flight snapshot uploads", remaining())
+		case <-ctx.Done():
+			cause := zap.Error(context.Cause(ctx))
+			logger.L().Warn(ctx, "shutting down with snapshot uploads still in flight", remaining(), cause)
+
+			return
+		case <-uploadsDone:
+			logger.L().Info(ctx, "all in-flight snapshot uploads finished")
+
+			return
+		}
+	}
+}
+
 func (s *Server) refreshStartingSandboxesLimit(ctx context.Context) {
 	ticker := time.NewTicker(startingSandboxesLimitRefreshInterval)
 	defer ticker.Stop()
diff --git a/packages/orchestrator/pkg/server/sandboxes.go b/packages/orchestrator/pkg/server/sandboxes.go
index b1783f1381..1584550c16 100644
--- a/packages/orchestrator/pkg/server/sandboxes.go
+++ b/packages/orchestrator/pkg/server/sandboxes.go
@@ -33,6 +33,7 @@ import (
 	"github.com/e2b-dev/infra/packages/shared/pkg/grpc/orchestrator"
 	"github.com/e2b-dev/infra/packages/shared/pkg/logger"
 	sbxlogger "github.com/e2b-dev/infra/packages/shared/pkg/logger/sandbox"
+	"github.com/e2b-dev/infra/packages/shared/pkg/retry"
 	"github.com/e2b-dev/infra/packages/shared/pkg/storage"
 	"github.com/e2b-dev/infra/packages/shared/pkg/telemetry"
 	"github.com/e2b-dev/infra/packages/shared/pkg/utils"
@@ -45,12 +46,25 @@ const (
 	// acquireTimeout is the max time to wait for a semaphore for resuming sandboxes snapshot.
 	acquireTimeout = 15 * time.Second
 
-	// uploadTimeout is the max time allowed for uploading snapshot files to
-	// remote storage.
+	// uploadTimeout is the max time allowed for a single upload attempt to
+	// remote storage. The overall retry window is uploadTotalBudget.
 	uploadTimeout = 20 * time.Minute
-	// redisPeerKeyTTL is slightly longer than uploadTimeout so the key is still
-	// valid for the entire upload window before being cleaned up.
-	redisPeerKeyTTL = uploadTimeout + 2*time.Minute
+	// uploadTotalBudget bounds how long a snapshot upload is retried before it
+	// is given up. Covers a long GCS outage without retrying forever.
+	uploadTotalBudget = 2 * time.Hour
+	// redisPeerKeyTTL keeps the peer routing key valid across the whole retry
+	// window so a long retry doesn't drop peer routing mid-upload. It is
+	// unregistered promptly once the upload finishes (success or give-up).
+	redisPeerKeyTTL = uploadTotalBudget + 2*time.Minute
+
+	// uploadRetryInitialBackoff is the wait before the first retry; it grows
+	// exponentially up to uploadRetryMaxBackoff.
+	uploadRetryInitialBackoff = 5 * time.Second
+	// uploadRetryMaxBackoff caps the backoff between attempts.
+	uploadRetryMaxBackoff = 2 * time.Minute
+	// uploadRetryBackoffMultiplier is the exponential growth factor between
+	// retry attempts.
+	uploadRetryBackoffMultiplier = 2
 
 	// executionEventDataKey is the key used in webhook event data for sandbox execution metrics.
 	executionEventDataKey = "execution"
@@ -860,7 +874,12 @@ func (s *Server) snapshotAndCacheSandbox(
 			return
 		}
 
-		s.uploadedBuilds.Set(meta.Template.BuildID, struct{}{}, ttlcache.DefaultTTL)
+		// Only advertise the build as fully uploaded when it actually landed.
+		// On abandon/failure the bytes are not in storage, so marking it would
+		// make chunk-serving falsely report "already uploaded".
+		if uploadErr == nil {
+			s.uploadedBuilds.Set(meta.Template.BuildID, struct{}{}, ttlcache.DefaultTTL)
+		}
 
 		if err := s.peerRegistry.Unregister(ctx, meta.Template.BuildID); err != nil {
 			logger.L().Warn(ctx, "failed to unregister peer address from routing", zap.String("build_id", meta.Template.BuildID), zap.Error(err))
@@ -885,23 +904,40 @@ func (s *Server) snapshotAndCacheSandbox(
 // background and cleans up the Redis peer key once done. Used by the Pause
 // handler where no prefetch data is available.
 func (s *Server) uploadSnapshotAsync(ctx context.Context, sbx *sandbox.Sandbox, res *snapshotResult) {
-	ctx, cancel := context.WithTimeout(context.WithoutCancel(ctx), uploadTimeout)
+	// Detach from the request: the upload retries for up to uploadTotalBudget.
+	// A graceful shutdown waits for it to finish (see Server.Close via uploadsWG)
+	// rather than cancelling, so an in-flight snapshot isn't dropped on restart.
+	uploadCtx := context.WithoutCancel(ctx)
 
-	go func() {
-		defer cancel()
+	s.uploadsInFlight.Add(1)
+	s.uploadsWG.Go(func() {
+		defer s.uploadsInFlight.Add(-1) // keeps the drain-progress count in step with this goroutine
 
-		ctx, span := tracer.Start(ctx, "upload snapshot")
+		spanCtx, span := tracer.Start(uploadCtx, "upload snapshot")
 		defer span.End()
 
-		err := res.upload.Run(ctx)
+		err := retry.Do(
+			spanCtx,
+			defaultUploadRetryPolicy(),
+			isRetryableUploadErr,
+			res.upload.Run, // invoked once per attempt with that attempt's context
+			func(attempt int, backoff time.Duration, err error) {
+				sbxlogger.I(sbx).Warn(spanCtx, "snapshot upload attempt failed, retrying",
+					zap.Int("attempt", attempt),
+					zap.Duration("backoff", backoff),
+					zap.Error(err),
+				)
+			},
+		)
 		if err != nil {
-			sbxlogger.I(sbx).Error(ctx, "error uploading snapshot files", zap.Error(err))
+			sbxlogger.I(sbx).Error(spanCtx, "snapshot upload did not durably land", zap.Error(err))
+			s.uploadFailedCounter.Add(spanCtx, 1) // one increment per abandoned upload, not per attempt
 		} else {
-			sbxlogger.I(sbx).Info(ctx, "snapshot finished uploading successfully")
+			sbxlogger.I(sbx).Info(spanCtx, "snapshot finished uploading successfully")
 		}
 
-		res.completeUpload(ctx, err)
-	}()
+		res.completeUpload(spanCtx, err) // runs on success and failure alike
+	})
 }
 
 // setupSandboxLifecycle sets up the cleanup goroutine for a sandbox.
diff --git a/packages/orchestrator/pkg/server/upload_retry.go b/packages/orchestrator/pkg/server/upload_retry.go
new file mode 100644
index 0000000000..dfcbd34cb9
--- /dev/null
+++ b/packages/orchestrator/pkg/server/upload_retry.go
@@ -0,0 +1,44 @@
+//go:build linux
+
+package server
+
+import (
+	"context"
+	"errors"
+
+	"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/build"
+	"github.com/e2b-dev/infra/packages/shared/pkg/retry"
+	"github.com/e2b-dev/infra/packages/shared/pkg/storage"
+)
+
+// defaultUploadRetryPolicy returns the retry.Policy for pause-snapshot uploads:
+// a fresh uploadTimeout per attempt, exponential backoff, uploadTotalBudget cap.
+func defaultUploadRetryPolicy() retry.Policy {
+	var policy retry.Policy
+	policy.TotalBudget = uploadTotalBudget
+	policy.AttemptTimeout = uploadTimeout
+	policy.InitialBackoff = uploadRetryInitialBackoff
+	policy.MaxBackoff = uploadRetryMaxBackoff
+	policy.Multiplier = uploadRetryBackoffMultiplier
+
+	return policy
+}
+
+// isRetryableUploadErr reports whether a failed upload attempt is worth trying
+// again. Retrying is the default: a lost snapshot is unrecoverable and cascades
+// to descendants, so only the terminal conditions below stop the loop.
+func isRetryableUploadErr(err error) bool {
+	terminal := [...]error{
+		build.NoDiffError{},       // nothing to upload
+		storage.ErrObjectNotExist, // source vanished; retry cannot recover it
+		context.Canceled,          // parent cancelled (shutdown)
+	}
+	for _, target := range terminal {
+		if errors.Is(err, target) {
+			return false
+		}
+	}
+
+	// Per-attempt context.DeadlineExceeded, GCS 401/503, rate limiting, unknown.
+	return true
+}
diff --git a/packages/orchestrator/pkg/server/upload_retry_test.go b/packages/orchestrator/pkg/server/upload_retry_test.go
new file mode 100644
index 0000000000..1d0ac73c5a
--- /dev/null
+++ b/packages/orchestrator/pkg/server/upload_retry_test.go
@@ -0,0 +1,40 @@
+//go:build linux
+
+package server
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+
+	"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/build"
+	"github.com/e2b-dev/infra/packages/shared/pkg/storage"
+)
+
+func TestIsRetryableUploadErr(t *testing.T) {
+	t.Parallel()
+
+	cases := []struct {
+		desc string
+		in   error
+		want bool
+	}{
+		{desc: "no diff", in: build.NoDiffError{}, want: false},
+		{desc: "object not exist", in: storage.ErrObjectNotExist, want: false},
+		{desc: "object not exist wrapped", in: fmt.Errorf("load: %w", storage.ErrObjectNotExist), want: false},
+		{desc: "parent cancelled", in: context.Canceled, want: false},
+		{desc: "per-attempt deadline", in: context.DeadlineExceeded, want: true},
+		{desc: "gcs 503", in: errors.New("server error (503)"), want: true},
+		{desc: "unknown", in: errors.New("boom"), want: true},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.desc, func(t *testing.T) {
+			t.Parallel()
+			assert.Equal(t, tc.want, isRetryableUploadErr(tc.in))
+		})
+	}
+}
diff --git a/packages/shared/pkg/retry/retry.go b/packages/shared/pkg/retry/retry.go
new file mode 100644
index 0000000000..aec73c23b1
--- /dev/null
+++ b/packages/shared/pkg/retry/retry.go
@@ -0,0 +1,121 @@
+package retry
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"time"
+)
+
+// ErrBudgetExhausted is returned (wrapped) when fn never succeeded within
+// Policy.TotalBudget.
+var ErrBudgetExhausted = errors.New("retry budget exhausted")
+
+// Policy configures Do.
+type Policy struct {
+	// TotalBudget bounds the wall-clock time across all attempts. Required: a
+	// non-positive value gives fn one attempt under an already-expired context.
+	TotalBudget time.Duration
+	// AttemptTimeout bounds a single attempt. <= 0 means no per-attempt timeout —
+	// each attempt is bounded only by the remaining budget.
+	AttemptTimeout time.Duration
+	// InitialBackoff is the wait before the first retry; 0 retries immediately.
+	InitialBackoff time.Duration
+	// MaxBackoff caps the backoff between attempts. <= 0 means uncapped.
+	MaxBackoff time.Duration
+	// Multiplier is the exponential growth factor between attempts (< 1 is
+	// treated as 1, i.e. constant backoff).
+	Multiplier int
+}
+
+// Do calls fn until one of four things happens: fn returns nil, retryable
+// rejects the error, policy.TotalBudget runs out, or ctx is cancelled.
+//
+// A single budget context derived from ctx spans the whole call. Every attempt
+// context is a child of it, so no attempt outlives the remaining budget and the
+// loop cannot run past TotalBudget. fn must respect the context it is given.
+//
+// A nil retryable means every error is retried. A non-nil onRetry is invoked
+// right before each backoff sleep with the 1-based number of the attempt that
+// just failed, the length of the upcoming sleep, and the error that attempt
+// returned.
+func Do(
+	ctx context.Context,
+	policy Policy,
+	retryable func(error) bool,
+	fn func(context.Context) error,
+	onRetry func(attempt int, backoff time.Duration, err error),
+) error {
+	runCtx, stop := context.WithTimeoutCause(ctx, policy.TotalBudget, ErrBudgetExhausted)
+	defer stop()
+
+	attempt, wait := 1, policy.InitialBackoff
+	for {
+		lastErr := runAttempt(runCtx, policy.AttemptTimeout, fn)
+
+		switch {
+		case lastErr == nil:
+			return nil
+		case runCtx.Err() != nil:
+			// The budget ran out or the caller cancelled. Testing runCtx rather
+			// than lastErr keeps a mere per-attempt timeout (runCtx still alive)
+			// on the retry path.
+			return stopError(runCtx, attempt, lastErr)
+		case retryable != nil && !retryable(lastErr):
+			return fmt.Errorf("non-retryable error after %d attempts: %w", attempt, lastErr)
+		}
+
+		// Tell the caller which attempt failed and how long the pause will be.
+		if onRetry != nil {
+			onRetry(attempt, wait, lastErr)
+		}
+
+		// Sleep out the backoff, but give up at once if the budget context ends.
+		select {
+		case <-time.After(wait):
+		case <-runCtx.Done():
+			return stopError(runCtx, attempt, lastErr)
+		}
+
+		attempt++
+		wait = nextBackoff(wait, policy.MaxBackoff, policy.Multiplier)
+	}
+}
+
+// runAttempt calls fn exactly once. With a positive attemptTimeout the call
+// gets a fresh timeout context derived from ctx, so the attempt ends at
+// whichever comes first: attemptTimeout or ctx's own deadline.
+func runAttempt(ctx context.Context, attemptTimeout time.Duration, fn func(context.Context) error) error {
+	if attemptTimeout > 0 {
+		var release context.CancelFunc
+		ctx, release = context.WithTimeout(ctx, attemptTimeout)
+		defer release() // frees the timer as soon as fn has returned
+	}
+
+	// A non-positive timeout leaves ctx as the only bound.
+	return fn(ctx)
+}
+
+// nextBackoff multiplies cur by multiplier (min 1), saturating at maxBackoff.
+func nextBackoff(cur, maxBackoff time.Duration, multiplier int) time.Duration {
+	factor := time.Duration(max(multiplier, 1))
+	if maxBackoff <= 0 {
+		maxBackoff = 1<<63 - 1 // uncapped: only the Duration range limits growth
+	}
+	if cur > maxBackoff/factor { // cur*factor would pass the cap or wrap int64
+		return maxBackoff
+	}
+
+	return cur * factor
+}
+
+// stopError turns an ended budget context into Do's terminal error: an expired
+// budget wraps ErrBudgetExhausted, any other cause is joined onto lastErr.
+func stopError(budgetCtx context.Context, attempt int, lastErr error) error {
+	switch cause := context.Cause(budgetCtx); {
+	case errors.Is(cause, ErrBudgetExhausted):
+		return fmt.Errorf("%w after %d attempts: %w", ErrBudgetExhausted, attempt, lastErr)
+	default:
+		return errors.Join(lastErr, cause)
+	}
+}
diff --git a/packages/shared/pkg/retry/retry_test.go b/packages/shared/pkg/retry/retry_test.go
new file mode 100644
index 0000000000..ce764ef705
--- /dev/null
+++ b/packages/shared/pkg/retry/retry_test.go
@@ -0,0 +1,175 @@
+package retry
+
+import (
+	"context"
+	"errors"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// fastPolicy returns a Policy with millisecond backoffs to keep tests quick.
+func fastPolicy() Policy {
+	const ms = time.Millisecond
+	p := Policy{TotalBudget: 2 * time.Second, AttemptTimeout: 50 * ms, Multiplier: 2}
+	p.InitialBackoff = ms
+	p.MaxBackoff = 5 * ms
+
+	return p
+}
+
+func TestDo_RetriesThenSucceeds(t *testing.T) {
+	t.Parallel()
+
+	var calls atomic.Int32
+	flaky := func(context.Context) error {
+		if n := calls.Add(1); n >= 3 {
+			return nil
+		}
+
+		return errors.New("transient")
+	}
+
+	require.NoError(t, Do(context.Background(), fastPolicy(), nil, flaky, nil))
+	assert.EqualValues(t, 3, calls.Load())
+}
+
+func TestDo_BudgetExhausted(t *testing.T) {
+	t.Parallel()
+
+	var calls atomic.Int32 // every call fails, so only the budget ends the loop
+	alwaysFail := func(context.Context) error {
+		calls.Add(1)
+
+		return errors.New("persistent")
+	}
+
+	got := Do(context.Background(), fastPolicy(), nil, alwaysFail, nil)
+	require.Error(t, got)
+	require.ErrorIs(t, got, ErrBudgetExhausted)
+	assert.Greater(t, calls.Load(), int32(1))
+}
+
+func TestDo_PerAttemptTimeoutDoesNotAbortLoop(t *testing.T) {
+	t.Parallel()
+
+	var calls atomic.Int32
+	hangOnce := func(ctx context.Context) error {
+		if calls.Add(1) > 1 {
+			return nil
+		}
+
+		<-ctx.Done() // only the first call blocks, until its attempt deadline fires
+
+		return ctx.Err()
+	}
+
+	require.NoError(t, Do(context.Background(), fastPolicy(), nil, hangOnce, nil))
+	assert.EqualValues(t, 2, calls.Load())
+}
+
+func TestDo_CapsAttemptToRemainingBudget(t *testing.T) {
+	t.Parallel()
+
+	// The 100ms budget is far shorter than the 10s per-attempt timeout, so it is
+	// the budget, not the attempt timeout, that has to unblock the attempt.
+	tight := Policy{
+		AttemptTimeout: 10 * time.Second,
+		TotalBudget:    100 * time.Millisecond,
+		InitialBackoff: time.Millisecond,
+		MaxBackoff:     5 * time.Millisecond,
+		Multiplier:     2,
+	}
+
+	blockUntilDone := func(ctx context.Context) error {
+		<-ctx.Done()
+
+		return ctx.Err()
+	}
+
+	began := time.Now()
+	err := Do(context.Background(), tight, nil, blockUntilDone, nil)
+	took := time.Since(began)
+
+	require.ErrorIs(t, err, ErrBudgetExhausted)
+	assert.Less(t, took, 2*time.Second)
+}
+
+func TestDo_NonRetryableStops(t *testing.T) {
+	t.Parallel()
+
+	permanent := errors.New("permanent")
+	var calls atomic.Int32
+	fail := func(context.Context) error {
+		calls.Add(1)
+
+		return permanent
+	}
+	keepGoing := func(err error) bool { return !errors.Is(err, permanent) }
+
+	got := Do(context.Background(), fastPolicy(), keepGoing, fail, nil)
+	require.ErrorIs(t, got, permanent)
+	assert.EqualValues(t, 1, calls.Load()) // stopped after the very first call
+}
+
+func TestDo_ParentCancelAborts(t *testing.T) {
+	t.Parallel()
+
+	parent, abort := context.WithCancel(context.Background())
+	cancelThenFail := func(context.Context) error {
+		abort() // the caller goes away while the attempt is still running
+
+		return errors.New("failed before cancel observed")
+	}
+
+	got := Do(parent, fastPolicy(), nil, cancelThenFail, nil)
+	require.Error(t, got)
+	require.ErrorIs(t, got, context.Canceled)
+	assert.NotErrorIs(t, got, ErrBudgetExhausted)
+}
+
+func TestDo_NoAttemptTimeoutUsesBudget(t *testing.T) {
+	t.Parallel()
+
+	// With AttemptTimeout left at 0, only the 80ms budget can end the attempt.
+	budgetOnly := Policy{
+		AttemptTimeout: 0,
+		TotalBudget:    80 * time.Millisecond,
+		InitialBackoff: time.Millisecond,
+		MaxBackoff:     5 * time.Millisecond,
+		Multiplier:     2,
+	}
+
+	blockUntilDone := func(ctx context.Context) error {
+		<-ctx.Done()
+
+		return ctx.Err()
+	}
+
+	began := time.Now()
+	err := Do(context.Background(), budgetOnly, nil, blockUntilDone, nil)
+
+	require.ErrorIs(t, err, ErrBudgetExhausted)
+	assert.Less(t, time.Since(began), time.Second)
+}
+
+func TestDo_OnRetryInvoked(t *testing.T) {
+	t.Parallel()
+
+	var calls, hooks atomic.Int32
+	flaky := func(context.Context) error {
+		if calls.Add(1) >= 3 {
+			return nil
+		}
+
+		return errors.New("transient")
+	}
+	// countHook tallies how often Do announces an upcoming retry.
+	countHook := func(int, time.Duration, error) { hooks.Add(1) }
+
+	require.NoError(t, Do(context.Background(), fastPolicy(), nil, flaky, countHook))
+	assert.EqualValues(t, 2, hooks.Load(), "onRetry fires once per retry (not the final success)")
+}
diff --git a/packages/shared/pkg/telemetry/meters.go b/packages/shared/pkg/telemetry/meters.go
index bde79e2e65..21fd8920bd 100644
--- a/packages/shared/pkg/telemetry/meters.go
+++ b/packages/shared/pkg/telemetry/meters.go
@@ -33,6 +33,11 @@ const (
 
 	OrchestratorSandboxKilledCounterName CounterType = "orchestrator.sandbox.killed"
 
+	// OrchestratorSnapshotUploadFailedCounterName counts pause-snapshot uploads
+	// that never landed durably (budget exhausted or a non-retryable error).
+	// A non-zero rate means lost snapshots.
+	OrchestratorSnapshotUploadFailedCounterName CounterType = "orchestrator.snapshot.upload.failed"
+
 	ApiRedisStoragePublisherPublished CounterType = "api.redis_storage.publisher.published"
 	ApiRedisStoragePublisherDropped   CounterType = "api.redis_storage.publisher.dropped"
 )
@@ -166,17 +171,18 @@ const (
 )
 
 var counterDesc = map[CounterType]string{
-	SandboxCreateMeterName:                   "Number of currently waiting requests to create a new sandbox",
-	ApiOrchestratorCreatedSandboxes:          "Number of successfully created sandboxes",
-	BuildResultCounterName:                   "Number of template build results",
-	BuildCacheResultCounterName:              "Number of build cache results",
-	TeamSandboxCreated:                       "Counter of started sandboxes for the team in the interval",
-	OrchestratorHostBalanceDirtyPagesThreads: "Cumulative stalled thread-polls during sandbox resume; rate() gives throttle intensity",
-	EnvdInitCalls:                            "Number of envd initialization calls",
-	OrchestratorSandboxKilledCounterName:     "Number of sandboxes killed, labeled by kill reason",
-	TCPFirewallConnectionsTotal:              "Total number of TCP firewall connections processed",
-	TCPFirewallErrorsTotal:                   "Total number of TCP firewall errors",
-	TCPFirewallDecisionsTotal:                "Total number of TCP firewall allow/block decisions",
+	SandboxCreateMeterName:                      "Number of currently waiting requests to create a new sandbox",
+	ApiOrchestratorCreatedSandboxes:             "Number of successfully created sandboxes",
+	BuildResultCounterName:                      "Number of template build results",
+	BuildCacheResultCounterName:                 "Number of build cache results",
+	TeamSandboxCreated:                          "Counter of started sandboxes for the team in the interval",
+	OrchestratorHostBalanceDirtyPagesThreads:    "Cumulative stalled thread-polls during sandbox resume; rate() gives throttle intensity",
+	EnvdInitCalls:                               "Number of envd initialization calls",
+	OrchestratorSandboxKilledCounterName:        "Number of sandboxes killed, labeled by kill reason",
+	OrchestratorSnapshotUploadFailedCounterName: "Number of pause-snapshot uploads that never landed durably",
+	TCPFirewallConnectionsTotal:                 "Total number of TCP firewall connections processed",
+	TCPFirewallErrorsTotal:                      "Total number of TCP firewall errors",
+	TCPFirewallDecisionsTotal:                   "Total number of TCP firewall allow/block decisions",
 
 	IngressProxyConnectionsBlockedTotal: "Total number of ingress proxy connections blocked by connection limit",
 	CmuxErrorsTotal:                     "Total number of cmux connection multiplexer errors",
@@ -193,17 +199,18 @@ var counterDesc = map[CounterType]string{
 }
 
 var counterUnits = map[CounterType]string{
-	SandboxCreateMeterName:                   "{sandbox}",
-	ApiOrchestratorCreatedSandboxes:          "{sandbox}",
-	BuildResultCounterName:                   "{build}",
-	BuildCacheResultCounterName:              "{layer}",
-	TeamSandboxCreated:                       "{sandbox}",
-	OrchestratorHostBalanceDirtyPagesThreads: "{thread}",
-	EnvdInitCalls:                            "1",
-	OrchestratorSandboxKilledCounterName:     "{sandbox}",
-	TCPFirewallConnectionsTotal:              "{connection}",
-	TCPFirewallErrorsTotal:                   "{error}",
-	TCPFirewallDecisionsTotal:                "{decision}",
+	SandboxCreateMeterName:                      "{sandbox}",
+	ApiOrchestratorCreatedSandboxes:             "{sandbox}",
+	BuildResultCounterName:                      "{build}",
+	BuildCacheResultCounterName:                 "{layer}",
+	TeamSandboxCreated:                          "{sandbox}",
+	OrchestratorHostBalanceDirtyPagesThreads:    "{thread}",
+	EnvdInitCalls:                               "1",
+	OrchestratorSandboxKilledCounterName:        "{sandbox}",
+	OrchestratorSnapshotUploadFailedCounterName: "{snapshot}",
+	TCPFirewallConnectionsTotal:                 "{connection}",
+	TCPFirewallErrorsTotal:                      "{error}",
+	TCPFirewallDecisionsTotal:                   "{decision}",
 
 	IngressProxyConnectionsBlockedTotal: "{connection}",
 	CmuxErrorsTotal:                     "{error}",