diff --git a/packages/orchestrator/pkg/factories/run.go b/packages/orchestrator/pkg/factories/run.go index 513df32b44..9237f4ddc2 100644 --- a/packages/orchestrator/pkg/factories/run.go +++ b/packages/orchestrator/pkg/factories/run.go @@ -575,8 +575,8 @@ func run(config cfg.Config, opts Options) (success bool) { if err != nil { logger.L().Fatal(ctx, "failed to create orchestrator server", zap.Error(err)) } - closers = append(closers, closer{"orchestrator server", func(context.Context) error { - return orchestratorService.Close() + closers = append(closers, closer{"orchestrator server", func(ctx context.Context) error { + return orchestratorService.Close(ctx) }}) // template manager sandbox logger diff --git a/packages/orchestrator/pkg/sandbox/uploads.go b/packages/orchestrator/pkg/sandbox/uploads.go index f04125e236..3e5cdef767 100644 --- a/packages/orchestrator/pkg/sandbox/uploads.go +++ b/packages/orchestrator/pkg/sandbox/uploads.go @@ -32,14 +32,19 @@ var ( ) const ( - futureTTL = 1 * time.Hour + // futureTTL must outlive a parent upload's full retry window so a child's + // in-memory Wait still finds the parent's future. Keep >= the upload retry + // budget (server.uploadTotalBudget, 2h). + futureTTL = 3 * time.Hour // refreshHeaderBudget bounds how long an upload Wait polls remote storage // for a parent's V4 header. Crosses orchestrators: A may still be uploading - // on a remote orch when B's runV4 calls Wait(A) here. Matches the - // per-upload bound in server.uploadTimeout — anything longer means the - // parent's upload is itself stuck and would have failed on its own. - refreshHeaderBudget = 20 * time.Minute + // on a remote orch when B's runV4 calls Wait(A) here. It must be >= the + // parent's full retry window (server.uploadTotalBudget, 2h); otherwise the + // poll's budget expiry returns a non-retryable "object does not exist" and + // the child gives up while the parent is still retrying. 
The per-attempt + // context (server.uploadTimeout) bounds the actual poll duration. + refreshHeaderBudget = 2 * time.Hour // uploadDoneChannelPrefix is the Redis pub/sub channel prefix for per-build // upload-finished signals. Empty payload = success; non-empty = upload error. diff --git a/packages/orchestrator/pkg/server/main.go b/packages/orchestrator/pkg/server/main.go index 37c3a0f513..b93db7f7cd 100644 --- a/packages/orchestrator/pkg/server/main.go +++ b/packages/orchestrator/pkg/server/main.go @@ -6,6 +6,7 @@ import ( "context" "fmt" "sync" + "sync/atomic" "time" "github.com/jellydator/ttlcache/v3" @@ -38,6 +39,10 @@ const uploadedBuildsTTL = 1 * time.Hour // MaxStartingInstancesPerNode feature flag and resize the semaphore. const startingSandboxesLimitRefreshInterval = 30 * time.Second +// uploadDrainLogInterval is how often Close logs progress while waiting for +// in-flight snapshot uploads to finish during shutdown. +const uploadDrainLogInterval = 10 * time.Second + type Server struct { orchestrator.UnimplementedSandboxServiceServer orchestrator.UnimplementedChunkServiceServer @@ -58,6 +63,13 @@ type Server struct { uploads *sandbox.Uploads sandboxCreateDuration metric.Int64Histogram sandboxKilledCounter metric.Int64Counter + uploadFailedCounter metric.Int64Counter + + // uploadsWG tracks in-flight async snapshot uploads so a graceful shutdown + // can wait for them to finish instead of dropping them. uploadsInFlight is + // the live count, used to log drain progress during shutdown. 
+ uploadsWG sync.WaitGroup + uploadsInFlight atomic.Int64 done chan struct{} closeOnce sync.Once @@ -123,6 +135,12 @@ func New(ctx context.Context, cfg ServiceConfig) (*Server, error) { } server.sandboxKilledCounter = sandboxKilledCounter + uploadFailedCounter, err := telemetry.GetCounter(meter, telemetry.OrchestratorSnapshotUploadFailedCounterName) + if err != nil { + return nil, fmt.Errorf("failed to register snapshot upload failed counter: %w", err) + } + server.uploadFailedCounter = uploadFailedCounter + _, err = telemetry.GetObservableUpDownCounter(meter, telemetry.OrchestratorSandboxCountMeterName, func(_ context.Context, observer metric.Int64Observer) error { observer.Observe(int64(server.sandboxFactory.Sandboxes.Count())) @@ -156,16 +174,61 @@ func New(ctx context.Context, cfg ServiceConfig) (*Server, error) { return server, nil } -func (s *Server) Close() error { +func (s *Server) Close(ctx context.Context) error { s.closeOnce.Do(func() { close(s.done) }) + // Wait for in-flight snapshot uploads to finish so a graceful shutdown + // doesn't drop a snapshot that is still uploading. ctx is cancelled on a + // forced stop, in which case we stop waiting and let the process exit. + uploadsDone := make(chan struct{}) + go func() { + s.uploadsWG.Wait() + close(uploadsDone) + }() + + s.drainUploads(ctx, uploadsDone) + s.uploadedBuilds.Stop() return nil } +// drainUploads waits for in-flight snapshot uploads to finish, logging progress +// periodically, until they complete or ctx is cancelled (forced stop). 
+func (s *Server) drainUploads(ctx context.Context, uploadsDone <-chan struct{}) { + inFlight := s.uploadsInFlight.Load() + if inFlight == 0 { + return + } + + logger.L().Info(ctx, "waiting for in-flight snapshot uploads to finish", zap.Int64("uploads", inFlight)) + + ticker := time.NewTicker(uploadDrainLogInterval) + defer ticker.Stop() + + for { + select { + case <-uploadsDone: + logger.L().Info(ctx, "all in-flight snapshot uploads finished") + + return + case <-ctx.Done(): + logger.L().Warn(ctx, "shutting down with snapshot uploads still in flight", + zap.Int64("uploads", s.uploadsInFlight.Load()), + zap.Error(context.Cause(ctx)), + ) + + return + case <-ticker.C: + logger.L().Info(ctx, "still waiting for in-flight snapshot uploads", + zap.Int64("uploads", s.uploadsInFlight.Load()), + ) + } + } +} + func (s *Server) refreshStartingSandboxesLimit(ctx context.Context) { ticker := time.NewTicker(startingSandboxesLimitRefreshInterval) defer ticker.Stop() diff --git a/packages/orchestrator/pkg/server/sandboxes.go b/packages/orchestrator/pkg/server/sandboxes.go index b1783f1381..1584550c16 100644 --- a/packages/orchestrator/pkg/server/sandboxes.go +++ b/packages/orchestrator/pkg/server/sandboxes.go @@ -33,6 +33,7 @@ import ( "github.com/e2b-dev/infra/packages/shared/pkg/grpc/orchestrator" "github.com/e2b-dev/infra/packages/shared/pkg/logger" sbxlogger "github.com/e2b-dev/infra/packages/shared/pkg/logger/sandbox" + "github.com/e2b-dev/infra/packages/shared/pkg/retry" "github.com/e2b-dev/infra/packages/shared/pkg/storage" "github.com/e2b-dev/infra/packages/shared/pkg/telemetry" "github.com/e2b-dev/infra/packages/shared/pkg/utils" @@ -45,12 +46,25 @@ const ( // acquireTimeout is the max time to wait for a semaphore for resuming sandboxes snapshot. acquireTimeout = 15 * time.Second - // uploadTimeout is the max time allowed for uploading snapshot files to - // remote storage. + // uploadTimeout is the max time allowed for a single upload attempt to + // remote storage. 
The overall retry window is uploadTotalBudget. uploadTimeout = 20 * time.Minute - // redisPeerKeyTTL is slightly longer than uploadTimeout so the key is still - // valid for the entire upload window before being cleaned up. - redisPeerKeyTTL = uploadTimeout + 2*time.Minute + // uploadTotalBudget bounds how long a snapshot upload is retried before it + // is given up. Covers a long GCS outage without retrying forever. + uploadTotalBudget = 2 * time.Hour + // redisPeerKeyTTL keeps the peer routing key valid across the whole retry + // window so a long retry doesn't drop peer routing mid-upload. It is + // unregistered promptly once the upload finishes (success or give-up). + redisPeerKeyTTL = uploadTotalBudget + 2*time.Minute + + // uploadRetryInitialBackoff is the wait before the first retry; it grows + // exponentially up to uploadRetryMaxBackoff. + uploadRetryInitialBackoff = 5 * time.Second + // uploadRetryMaxBackoff caps the backoff between attempts. + uploadRetryMaxBackoff = 2 * time.Minute + // uploadRetryBackoffMultiplier is the exponential growth factor between + // retry attempts. + uploadRetryBackoffMultiplier = 2 // executionEventDataKey is the key used in webhook event data for sandbox execution metrics. executionEventDataKey = "execution" @@ -860,7 +874,12 @@ func (s *Server) snapshotAndCacheSandbox( return } - s.uploadedBuilds.Set(meta.Template.BuildID, struct{}{}, ttlcache.DefaultTTL) + // Only advertise the build as fully uploaded when it actually landed. + // On abandon/failure the bytes are not in storage, so marking it would + // make chunk-serving falsely report "already uploaded". 
+ if uploadErr == nil { + s.uploadedBuilds.Set(meta.Template.BuildID, struct{}{}, ttlcache.DefaultTTL) + } if err := s.peerRegistry.Unregister(ctx, meta.Template.BuildID); err != nil { logger.L().Warn(ctx, "failed to unregister peer address from routing", zap.String("build_id", meta.Template.BuildID), zap.Error(err)) @@ -885,23 +904,40 @@ func (s *Server) snapshotAndCacheSandbox( // background and cleans up the Redis peer key once done. Used by the Pause // handler where no prefetch data is available. func (s *Server) uploadSnapshotAsync(ctx context.Context, sbx *sandbox.Sandbox, res *snapshotResult) { - ctx, cancel := context.WithTimeout(context.WithoutCancel(ctx), uploadTimeout) + // Detach from the request: the upload retries for up to uploadTotalBudget. + // A graceful shutdown waits for it to finish (see Server.Close via uploadsWG) + // rather than cancelling, so an in-flight snapshot isn't dropped on restart. + uploadCtx := context.WithoutCancel(ctx) - go func() { - defer cancel() + s.uploadsInFlight.Add(1) + s.uploadsWG.Go(func() { + defer s.uploadsInFlight.Add(-1) - ctx, span := tracer.Start(ctx, "upload snapshot") + spanCtx, span := tracer.Start(uploadCtx, "upload snapshot") defer span.End() - err := res.upload.Run(ctx) + err := retry.Do( + spanCtx, + defaultUploadRetryPolicy(), + isRetryableUploadErr, + res.upload.Run, + func(attempt int, backoff time.Duration, err error) { + sbxlogger.I(sbx).Warn(spanCtx, "snapshot upload attempt failed, retrying", + zap.Int("attempt", attempt), + zap.Duration("backoff", backoff), + zap.Error(err), + ) + }, + ) if err != nil { - sbxlogger.I(sbx).Error(ctx, "error uploading snapshot files", zap.Error(err)) + sbxlogger.I(sbx).Error(spanCtx, "snapshot upload did not durably land", zap.Error(err)) + s.uploadFailedCounter.Add(spanCtx, 1) } else { - sbxlogger.I(sbx).Info(ctx, "snapshot finished uploading successfully") + sbxlogger.I(sbx).Info(spanCtx, "snapshot finished uploading successfully") } - res.completeUpload(ctx, err) 
- }() + res.completeUpload(spanCtx, err) + }) } // setupSandboxLifecycle sets up the cleanup goroutine for a sandbox. diff --git a/packages/orchestrator/pkg/server/upload_retry.go b/packages/orchestrator/pkg/server/upload_retry.go new file mode 100644 index 0000000000..dfcbd34cb9 --- /dev/null +++ b/packages/orchestrator/pkg/server/upload_retry.go @@ -0,0 +1,44 @@ +//go:build linux + +package server + +import ( + "context" + "errors" + + "github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/build" + "github.com/e2b-dev/infra/packages/shared/pkg/retry" + "github.com/e2b-dev/infra/packages/shared/pkg/storage" +) + +// defaultUploadRetryPolicy is the retry policy for pause-snapshot uploads: +// retry with a fresh per-attempt timeout under the total budget, with +// exponential backoff. +func defaultUploadRetryPolicy() retry.Policy { + return retry.Policy{ + TotalBudget: uploadTotalBudget, + AttemptTimeout: uploadTimeout, + InitialBackoff: uploadRetryInitialBackoff, + MaxBackoff: uploadRetryMaxBackoff, + Multiplier: uploadRetryBackoffMultiplier, + } +} + +// isRetryableUploadErr classifies an upload failure. The default is RETRYABLE: +// a lost snapshot is unrecoverable and cascades to descendants, so a wasted +// retry is far cheaper than dropping a recoverable build. Only genuinely +// terminal conditions stop the loop. +func isRetryableUploadErr(err error) bool { + switch { + case errors.Is(err, build.NoDiffError{}): + return false // nothing to upload + case errors.Is(err, storage.ErrObjectNotExist): + return false // source vanished; retry cannot recover it + case errors.Is(err, context.Canceled): + return false // parent cancelled (shutdown) + default: + // Includes per-attempt context.DeadlineExceeded, GCS 401/503, rate + // limiting, and unknown errors — all worth retrying within the budget. 
+		return true
+	}
+}
diff --git a/packages/orchestrator/pkg/server/upload_retry_test.go b/packages/orchestrator/pkg/server/upload_retry_test.go
new file mode 100644
index 0000000000..1d0ac73c5a
--- /dev/null
+++ b/packages/orchestrator/pkg/server/upload_retry_test.go
@@ -0,0 +1,40 @@
+//go:build linux
+
+package server
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+
+	"github.com/e2b-dev/infra/packages/orchestrator/pkg/sandbox/build"
+	"github.com/e2b-dev/infra/packages/shared/pkg/storage"
+)
+
+func TestIsRetryableUploadErr(t *testing.T) {
+	t.Parallel()
+
+	tests := []struct {
+		name      string
+		err       error
+		retryable bool
+	}{
+		{"no diff", build.NoDiffError{}, false},
+		{"object not exist", storage.ErrObjectNotExist, false},
+		{"object not exist wrapped", fmt.Errorf("load: %w", storage.ErrObjectNotExist), false},
+		{"parent cancelled", context.Canceled, false},
+		{"per-attempt deadline", context.DeadlineExceeded, true},
+		{"gcs 503", errors.New("server error (503)"), true},
+		{"unknown", errors.New("boom"), true},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			t.Parallel()
+			assert.Equal(t, tt.retryable, isRetryableUploadErr(tt.err))
+		})
+	}
+}
diff --git a/packages/shared/pkg/retry/retry.go b/packages/shared/pkg/retry/retry.go
new file mode 100644
index 0000000000..aec73c23b1
--- /dev/null
+++ b/packages/shared/pkg/retry/retry.go
@@ -0,0 +1,134 @@
+package retry
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"time"
+)
+
+// ErrBudgetExhausted is returned (wrapped) when fn never succeeded within
+// Policy.TotalBudget.
+var ErrBudgetExhausted = errors.New("retry budget exhausted")
+
+// Policy configures Do.
+type Policy struct {
+	// TotalBudget bounds the wall-clock time across all attempts. Required: a
+	// non-positive value makes Do return ErrBudgetExhausted without calling fn.
+	TotalBudget time.Duration
+	// AttemptTimeout bounds a single attempt. 0 means no per-attempt timeout —
+	// each attempt is bounded only by the remaining budget.
+	AttemptTimeout time.Duration
+	// InitialBackoff is the wait before the first retry. 0 retries without waiting.
+	InitialBackoff time.Duration
+	// MaxBackoff caps the backoff between attempts. 0 means uncapped.
+	MaxBackoff time.Duration
+	// Multiplier is the exponential growth factor between attempts (< 1 is
+	// treated as 1, i.e. constant backoff).
+	Multiplier int
+}
+
+// Do runs fn with retries until it returns nil, retryable reports the error as
+// non-retryable, the budget is exhausted, or ctx is cancelled.
+//
+// The whole call runs under a single budget context derived from ctx, so each
+// attempt's context is capped to the remaining budget and the loop never runs
+// past TotalBudget. fn must respect the context it is given.
+//
+// retryable classifies an error as worth retrying; a nil retryable treats every
+// error as retryable. onRetry, if non-nil, is called before each backoff sleep
+// with the 1-based attempt number, the upcoming backoff, and the error that
+// triggered the retry.
+func Do(
+	ctx context.Context,
+	policy Policy,
+	retryable func(error) bool,
+	fn func(context.Context) error,
+	onRetry func(attempt int, backoff time.Duration, err error),
+) error {
+	// A non-positive budget can never fit an attempt: fail fast instead of
+	// invoking fn (and its side effects) with an already-expired context.
+	if policy.TotalBudget <= 0 {
+		return fmt.Errorf("%w: non-positive total budget %s", ErrBudgetExhausted, policy.TotalBudget)
+	}
+
+	budgetCtx, cancel := context.WithTimeoutCause(ctx, policy.TotalBudget, ErrBudgetExhausted)
+	defer cancel()
+
+	backoff := policy.InitialBackoff
+
+	for attempt := 1; ; attempt++ {
+		err := runAttempt(budgetCtx, policy.AttemptTimeout, fn)
+		if err == nil {
+			return nil
+		}
+
+		// Budget exhausted or parent cancelled: stop. Checking the budget
+		// context (not err) distinguishes these from a per-attempt timeout,
+		// which leaves budgetCtx alive and is retryable.
+		if budgetCtx.Err() != nil {
+			return stopError(budgetCtx, attempt, err)
+		}
+
+		if retryable != nil && !retryable(err) {
+			return fmt.Errorf("non-retryable error after %d attempts: %w", attempt, err)
+		}
+
+		if onRetry != nil {
+			onRetry(attempt, backoff, err)
+		}
+
+		select {
+		case <-budgetCtx.Done():
+			return stopError(budgetCtx, attempt, err)
+		case <-time.After(backoff):
+		}
+
+		backoff = nextBackoff(backoff, policy.MaxBackoff, policy.Multiplier)
+	}
+}
+
+// runAttempt runs a single attempt under a fresh per-attempt timeout derived
+// from ctx. Because the attempt context derives from the budget context, its
+// effective deadline is min(attemptTimeout, remaining budget).
+func runAttempt(ctx context.Context, attemptTimeout time.Duration, fn func(context.Context) error) error {
+	if attemptTimeout <= 0 {
+		return fn(ctx)
+	}
+
+	attemptCtx, cancel := context.WithTimeout(ctx, attemptTimeout)
+	defer cancel()
+
+	return fn(attemptCtx)
+}
+
+// nextBackoff returns the backoff to use after cur: cur scaled by multiplier
+// (values < 1 are treated as 1) and clamped to maxBackoff when maxBackoff > 0.
+// An uncapped backoff saturates at the largest Duration instead of overflowing.
+func nextBackoff(cur, maxBackoff time.Duration, multiplier int) time.Duration {
+	factor := time.Duration(max(multiplier, 1))
+
+	limit := maxBackoff
+	if limit <= 0 {
+		limit = 1<<63 - 1 // uncapped: the largest representable Duration
+	}
+
+	// Compare by division so cur*factor can never overflow int64 and wrap to a
+	// negative Duration, which would make the backoff sleep return immediately.
+	if cur > limit/factor {
+		return limit
+	}
+
+	return cur * factor
+}
+
+// stopError maps a stopped budget context to a terminal error: budget
+// exhaustion vs. parent cancellation (e.g. caller shutdown).
+func stopError(budgetCtx context.Context, attempt int, lastErr error) error { + cause := context.Cause(budgetCtx) + if errors.Is(cause, ErrBudgetExhausted) { + return fmt.Errorf("%w after %d attempts: %w", ErrBudgetExhausted, attempt, lastErr) + } + + return errors.Join(lastErr, cause) +} diff --git a/packages/shared/pkg/retry/retry_test.go b/packages/shared/pkg/retry/retry_test.go new file mode 100644 index 0000000000..ce764ef705 --- /dev/null +++ b/packages/shared/pkg/retry/retry_test.go @@ -0,0 +1,175 @@ +package retry + +import ( + "context" + "errors" + "sync/atomic" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func fastPolicy() Policy { + return Policy{ + TotalBudget: 2 * time.Second, + AttemptTimeout: 50 * time.Millisecond, + InitialBackoff: time.Millisecond, + MaxBackoff: 5 * time.Millisecond, + Multiplier: 2, + } +} + +func TestDo_RetriesThenSucceeds(t *testing.T) { + t.Parallel() + + var attempts atomic.Int32 + fn := func(context.Context) error { + if attempts.Add(1) < 3 { + return errors.New("transient") + } + + return nil + } + + require.NoError(t, Do(context.Background(), fastPolicy(), nil, fn, nil)) + assert.EqualValues(t, 3, attempts.Load()) +} + +func TestDo_BudgetExhausted(t *testing.T) { + t.Parallel() + + var attempts atomic.Int32 + fn := func(context.Context) error { + attempts.Add(1) + + return errors.New("persistent") + } + + err := Do(context.Background(), fastPolicy(), nil, fn, nil) + require.Error(t, err) + require.ErrorIs(t, err, ErrBudgetExhausted) + assert.Greater(t, attempts.Load(), int32(1)) +} + +func TestDo_PerAttemptTimeoutDoesNotAbortLoop(t *testing.T) { + t.Parallel() + + var attempts atomic.Int32 + fn := func(ctx context.Context) error { + if attempts.Add(1) == 1 { + <-ctx.Done() // first attempt blows its per-attempt deadline + + return ctx.Err() + } + + return nil + } + + require.NoError(t, Do(context.Background(), fastPolicy(), nil, fn, nil)) + 
assert.EqualValues(t, 2, attempts.Load()) +} + +func TestDo_CapsAttemptToRemainingBudget(t *testing.T) { + t.Parallel() + + // AttemptTimeout (10s) far exceeds the budget (100ms): a blocking attempt + // must be cut off at the budget, not run for the full per-attempt timeout. + policy := Policy{ + TotalBudget: 100 * time.Millisecond, + AttemptTimeout: 10 * time.Second, + InitialBackoff: time.Millisecond, + MaxBackoff: 5 * time.Millisecond, + Multiplier: 2, + } + + fn := func(ctx context.Context) error { + <-ctx.Done() + + return ctx.Err() + } + + start := time.Now() + err := Do(context.Background(), policy, nil, fn, nil) + elapsed := time.Since(start) + + require.ErrorIs(t, err, ErrBudgetExhausted) + assert.Less(t, elapsed, 2*time.Second) +} + +func TestDo_NonRetryableStops(t *testing.T) { + t.Parallel() + + sentinel := errors.New("permanent") + var attempts atomic.Int32 + fn := func(context.Context) error { + attempts.Add(1) + + return sentinel + } + retryable := func(err error) bool { return !errors.Is(err, sentinel) } + + err := Do(context.Background(), fastPolicy(), retryable, fn, nil) + require.ErrorIs(t, err, sentinel) + assert.EqualValues(t, 1, attempts.Load()) +} + +func TestDo_ParentCancelAborts(t *testing.T) { + t.Parallel() + + ctx, cancel := context.WithCancel(context.Background()) + fn := func(context.Context) error { + cancel() + + return errors.New("failed before cancel observed") + } + + err := Do(ctx, fastPolicy(), nil, fn, nil) + require.Error(t, err) + require.ErrorIs(t, err, context.Canceled) + assert.NotErrorIs(t, err, ErrBudgetExhausted) +} + +func TestDo_NoAttemptTimeoutUsesBudget(t *testing.T) { + t.Parallel() + + // AttemptTimeout == 0: the attempt is bounded only by the remaining budget. 
+ policy := Policy{ + TotalBudget: 80 * time.Millisecond, + AttemptTimeout: 0, + InitialBackoff: time.Millisecond, + MaxBackoff: 5 * time.Millisecond, + Multiplier: 2, + } + + fn := func(ctx context.Context) error { + <-ctx.Done() + + return ctx.Err() + } + + start := time.Now() + err := Do(context.Background(), policy, nil, fn, nil) + + require.ErrorIs(t, err, ErrBudgetExhausted) + assert.Less(t, time.Since(start), time.Second) +} + +func TestDo_OnRetryInvoked(t *testing.T) { + t.Parallel() + + var retries atomic.Int32 + var attempts atomic.Int32 + fn := func(context.Context) error { + if attempts.Add(1) < 3 { + return errors.New("transient") + } + + return nil + } + onRetry := func(int, time.Duration, error) { retries.Add(1) } + + require.NoError(t, Do(context.Background(), fastPolicy(), nil, fn, onRetry)) + assert.EqualValues(t, 2, retries.Load(), "onRetry fires once per retry (not the final success)") +} diff --git a/packages/shared/pkg/telemetry/meters.go b/packages/shared/pkg/telemetry/meters.go index bde79e2e65..21fd8920bd 100644 --- a/packages/shared/pkg/telemetry/meters.go +++ b/packages/shared/pkg/telemetry/meters.go @@ -33,6 +33,11 @@ const ( OrchestratorSandboxKilledCounterName CounterType = "orchestrator.sandbox.killed" + // OrchestratorSnapshotUploadFailedCounterName counts pause-snapshot uploads + // that never landed durably (budget exhausted or a non-retryable error). + // A non-zero rate means lost snapshots. 
+ OrchestratorSnapshotUploadFailedCounterName CounterType = "orchestrator.snapshot.upload.failed" + ApiRedisStoragePublisherPublished CounterType = "api.redis_storage.publisher.published" ApiRedisStoragePublisherDropped CounterType = "api.redis_storage.publisher.dropped" ) @@ -166,17 +171,18 @@ const ( ) var counterDesc = map[CounterType]string{ - SandboxCreateMeterName: "Number of currently waiting requests to create a new sandbox", - ApiOrchestratorCreatedSandboxes: "Number of successfully created sandboxes", - BuildResultCounterName: "Number of template build results", - BuildCacheResultCounterName: "Number of build cache results", - TeamSandboxCreated: "Counter of started sandboxes for the team in the interval", - OrchestratorHostBalanceDirtyPagesThreads: "Cumulative stalled thread-polls during sandbox resume; rate() gives throttle intensity", - EnvdInitCalls: "Number of envd initialization calls", - OrchestratorSandboxKilledCounterName: "Number of sandboxes killed, labeled by kill reason", - TCPFirewallConnectionsTotal: "Total number of TCP firewall connections processed", - TCPFirewallErrorsTotal: "Total number of TCP firewall errors", - TCPFirewallDecisionsTotal: "Total number of TCP firewall allow/block decisions", + SandboxCreateMeterName: "Number of currently waiting requests to create a new sandbox", + ApiOrchestratorCreatedSandboxes: "Number of successfully created sandboxes", + BuildResultCounterName: "Number of template build results", + BuildCacheResultCounterName: "Number of build cache results", + TeamSandboxCreated: "Counter of started sandboxes for the team in the interval", + OrchestratorHostBalanceDirtyPagesThreads: "Cumulative stalled thread-polls during sandbox resume; rate() gives throttle intensity", + EnvdInitCalls: "Number of envd initialization calls", + OrchestratorSandboxKilledCounterName: "Number of sandboxes killed, labeled by kill reason", + OrchestratorSnapshotUploadFailedCounterName: "Number of pause-snapshot uploads that never 
landed durably", + TCPFirewallConnectionsTotal: "Total number of TCP firewall connections processed", + TCPFirewallErrorsTotal: "Total number of TCP firewall errors", + TCPFirewallDecisionsTotal: "Total number of TCP firewall allow/block decisions", IngressProxyConnectionsBlockedTotal: "Total number of ingress proxy connections blocked by connection limit", CmuxErrorsTotal: "Total number of cmux connection multiplexer errors", @@ -193,17 +199,18 @@ var counterDesc = map[CounterType]string{ } var counterUnits = map[CounterType]string{ - SandboxCreateMeterName: "{sandbox}", - ApiOrchestratorCreatedSandboxes: "{sandbox}", - BuildResultCounterName: "{build}", - BuildCacheResultCounterName: "{layer}", - TeamSandboxCreated: "{sandbox}", - OrchestratorHostBalanceDirtyPagesThreads: "{thread}", - EnvdInitCalls: "1", - OrchestratorSandboxKilledCounterName: "{sandbox}", - TCPFirewallConnectionsTotal: "{connection}", - TCPFirewallErrorsTotal: "{error}", - TCPFirewallDecisionsTotal: "{decision}", + SandboxCreateMeterName: "{sandbox}", + ApiOrchestratorCreatedSandboxes: "{sandbox}", + BuildResultCounterName: "{build}", + BuildCacheResultCounterName: "{layer}", + TeamSandboxCreated: "{sandbox}", + OrchestratorHostBalanceDirtyPagesThreads: "{thread}", + EnvdInitCalls: "1", + OrchestratorSandboxKilledCounterName: "{sandbox}", + OrchestratorSnapshotUploadFailedCounterName: "{snapshot}", + TCPFirewallConnectionsTotal: "{connection}", + TCPFirewallErrorsTotal: "{error}", + TCPFirewallDecisionsTotal: "{decision}", IngressProxyConnectionsBlockedTotal: "{connection}", CmuxErrorsTotal: "{error}",