diff --git a/pkg/authserver/runner/dcr.go b/pkg/authserver/runner/dcr.go index a850e5fa65..6dde6158e0 100644 --- a/pkg/authserver/runner/dcr.go +++ b/pkg/authserver/runner/dcr.go @@ -73,6 +73,16 @@ var authMethodPreference = []string{ // // The struct is the unit of storage in dcrResolutionCache and the unit of // application via consumeResolution. +// +// MUST update both converters (resolutionToCredentials and +// credentialsToResolution in dcr_store.go) when adding, renaming, or +// removing a field here. The two converters are the seam between this +// runner-side type and the persisted *storage.DCRCredentials shape; a +// field added here without a paired converter update will silently fail +// to round-trip across an authserver restart, the exact "parallel types +// drift" failure mode .claude/rules/go-style.md warns about. The +// round-trip behaviour is pinned by TestResolutionCredentialsRoundTrip +// in dcr_store_test.go. type DCRResolution struct { // ClientID is the RFC 7591 "client_id" returned by the authorization // server. diff --git a/pkg/authserver/runner/dcr_store.go b/pkg/authserver/runner/dcr_store.go index 07ef0a2985..7b03334c13 100644 --- a/pkg/authserver/runner/dcr_store.go +++ b/pkg/authserver/runner/dcr_store.go @@ -5,8 +5,8 @@ package runner import ( "context" + "errors" "fmt" - "sync" "time" "github.com/stacklok/toolhive/pkg/authserver/storage" @@ -30,16 +30,18 @@ type DCRKey = storage.DCRKey // keyed by the (Issuer, RedirectURI, ScopesHash) tuple. Implementations must // be safe for concurrent use. // -// This is a runner-internal cache of *DCRResolution values; it is distinct -// from the persistent storage.DCRCredentialStore (which holds *DCRCredentials -// and is the durable contract sub-issue 3 wires the resolver to use). Naming -// them differently keeps the two interfaces unambiguous to readers and grep -// tooling while both exist during the Phase 3 migration. +// This is the runner-facing interface used by the DCR resolver. It is a +// narrow re-projection of storage.DCRCredentialStore that exchanges +// *DCRResolution values (the resolver's working type) instead of +// *storage.DCRCredentials so the resolver internals stay agnostic to the +// persistence layer's exact field shape. // -// The cache is in-memory and holds long-lived registrations — entries are -// never expired or evicted by the cache itself. Callers are responsible for -// invalidating entries when the underlying registration is revoked (e.g., -// via RFC 7592 deregistration). +// Implementations in this package are thin adapters around a +// storage.DCRCredentialStore — the durable map / Redis hash lives over +// there, and this interface adds a per-call DCRResolution <-> DCRCredentials +// translation. There is exactly one persistence implementation per backend: +// storage.MemoryStorage and storage.RedisStorage. See newStorageBackedStore +// for the adapter. type dcrResolutionCache interface { // Get returns the cached resolution for key, or (nil, false, nil) if the // key is not present. An error is returned only on backend failure. @@ -52,70 +54,36 @@ type dcrResolutionCache interface { Put(ctx context.Context, key DCRKey, resolution *DCRResolution) error } -// newInMemoryDCRResolutionCache returns a thread-safe in-memory -// dcrResolutionCache intended for tests and single-replica development -// deployments. Production deployments should use the Redis-backed store -// introduced in Phase 3, which addresses the cross-replica sharing, -// durability, and cross-process coordination gaps documented below. -// -// Entries are retained for the process lifetime; there is no TTL and no -// background cleanup goroutine. Growth is bounded by upstream count × -// distinct scope sets ever registered for each upstream during the -// process lifetime — for a stable configuration this collapses to the -// upstream count, but rotating scope sets (operator-driven scope -// changes, or upstream scopes_supported rotations re-derived by the -// resolver) accumulate stale entries that survive until restart. This -// implementation is not the production answer; the Redis backend -// introduced in Phase 3 mitigates the rotation case via SetEX TTL. -// -// What this enables: serialises Get/Put against a single in-process map so -// concurrent callers within one authserver process see a consistent view of -// the cache without redundant RFC 7591 registrations. -// -// What this does NOT solve: -// - Cross-replica sharing: each replica holds its own independent map, so a -// registration performed on replica A is not visible to replica B. In a -// multi-replica deployment every replica will register its own DCR client -// against the upstream on first boot. Phase 3 introduces a Redis-backed -// store that addresses this. -// - Durability across restarts: process exit drops every entry; the next -// boot re-registers. Operators relying on stable client_ids must use a -// persistent backend. -// - Cross-process write coordination: two processes (or replicas) calling -// Put for the same DCRKey concurrently will both succeed against their -// local maps; whichever registration the upstream accepts last wins on -// that side, the loser becomes orphaned. The -// resolveDCRCredentials-level singleflight in dcr.go only deduplicates -// within one process. -func newInMemoryDCRResolutionCache() dcrResolutionCache { - return &inMemoryDCRResolutionCache{ - entries: make(map[DCRKey]*DCRResolution), - } +// newStorageBackedStore returns a dcrResolutionCache that delegates to a +// storage.DCRCredentialStore for durable persistence and translates +// DCRResolution values into DCRCredentials at the boundary. The returned +// store is safe for concurrent use because the underlying +// storage.DCRCredentialStore must be (per its interface contract). +func newStorageBackedStore(backend storage.DCRCredentialStore) dcrResolutionCache { + return &storageBackedStore{backend: backend} } -// inMemoryDCRResolutionCache is the default dcrResolutionCache backed by a -// plain map guarded by sync.RWMutex. Modelled on -// pkg/authserver/storage/memory.go but stripped of TTL bookkeeping — DCR -// resolutions are long-lived. -type inMemoryDCRResolutionCache struct { - mu sync.RWMutex - entries map[DCRKey]*DCRResolution +// storageBackedStore is the runner-side dcrResolutionCache wrapping a +// storage.DCRCredentialStore. Its methods are the only place that converts +// between the resolver's *DCRResolution and the persisted +// *storage.DCRCredentials shapes. +type storageBackedStore struct { + backend storage.DCRCredentialStore } // Get implements dcrResolutionCache. -func (s *inMemoryDCRResolutionCache) Get(_ context.Context, key DCRKey) (*DCRResolution, bool, error) { - s.mu.RLock() - defer s.mu.RUnlock() - - res, ok := s.entries[key] - if !ok { - return nil, false, nil +// +// A storage-level ErrNotFound is translated into the (nil, false, nil) +// miss-tuple advertised by the interface. Other errors propagate as-is. +func (s *storageBackedStore) Get(ctx context.Context, key DCRKey) (*DCRResolution, bool, error) { + creds, err := s.backend.GetDCRCredentials(ctx, key) + if err != nil { + if errors.Is(err, storage.ErrNotFound) { + return nil, false, nil + } + return nil, false, err } - // Return a defensive copy so mutations by the caller never reach the - // cache entry — internal maps and pointers must not be reachable from - // the caller's value. - cp := *res - return &cp, true, nil + return credentialsToResolution(creds), true, nil } // Put implements dcrResolutionCache. @@ -124,16 +92,70 @@ func (s *inMemoryDCRResolutionCache) Get(_ context.Context, key DCRKey) (*DCRRes // passing nil would otherwise get a successful return, observe a miss on // the next Get, and have no error trail to debug from. Failing loudly at // the boundary makes such bugs visible at the first call. -func (s *inMemoryDCRResolutionCache) Put(_ context.Context, key DCRKey, resolution *DCRResolution) error { +func (s *storageBackedStore) Put(ctx context.Context, key DCRKey, resolution *DCRResolution) error { if resolution == nil { return fmt.Errorf("dcr: resolution must not be nil") } - s.mu.Lock() - defer s.mu.Unlock() + creds := resolutionToCredentials(key, resolution) + return s.backend.StoreDCRCredentials(ctx, creds) +} + +// resolutionToCredentials converts a resolver-side *DCRResolution into the +// persisted *storage.DCRCredentials shape. The DCRKey is supplied separately +// because storage.DCRCredentials carries the key as a struct field rather +// than implicitly via a map key, so the persistence layer can round-trip it +// across processes and backends. +// +// Fields that exist on DCRResolution but not on DCRCredentials are dropped: +// - ClientIDIssuedAt: informational only per RFC 7591 §3.2.1; the resolver +// does not consult it for cache invalidation, so it does not need to +// survive a process restart. +// - RedirectURI: already encoded into key.RedirectURI; storing it twice +// would risk drift between the canonical key and the persisted value. +// +// CreatedAt and ClientSecretExpiresAt are preserved so cache observers +// (e.g. lookupCachedResolution's staleness Warn) and TTL-aware backends +// (Redis) keep their existing behaviour after a restart. +func resolutionToCredentials(key DCRKey, res *DCRResolution) *storage.DCRCredentials { + if res == nil { + return nil + } + return &storage.DCRCredentials{ + Key: key, + ClientID: res.ClientID, + ClientSecret: res.ClientSecret, + TokenEndpointAuthMethod: res.TokenEndpointAuthMethod, + RegistrationAccessToken: res.RegistrationAccessToken, + RegistrationClientURI: res.RegistrationClientURI, + AuthorizationEndpoint: res.AuthorizationEndpoint, + TokenEndpoint: res.TokenEndpoint, + CreatedAt: res.CreatedAt, + ClientSecretExpiresAt: res.ClientSecretExpiresAt, + } +} - // Defensive copy so the caller's subsequent mutations do not reach the - // cache entry. - cp := *resolution - s.entries[key] = &cp - return nil +// credentialsToResolution is the inverse of resolutionToCredentials. The +// RedirectURI is recovered from the persisted Key so consumers that read it +// off the resolution (e.g. consumeResolution, which writes it back onto a +// run-config copy when the caller left it empty) see the canonical value. +// +// ClientIDIssuedAt is left zero because it is not persisted. Callers that +// care about it (none today) must read it directly from the live RFC 7591 +// response, not from a cached resolution. +func credentialsToResolution(creds *storage.DCRCredentials) *DCRResolution { + if creds == nil { + return nil + } + return &DCRResolution{ + ClientID: creds.ClientID, + ClientSecret: creds.ClientSecret, + AuthorizationEndpoint: creds.AuthorizationEndpoint, + TokenEndpoint: creds.TokenEndpoint, + RegistrationAccessToken: creds.RegistrationAccessToken, + RegistrationClientURI: creds.RegistrationClientURI, + TokenEndpointAuthMethod: creds.TokenEndpointAuthMethod, + RedirectURI: creds.Key.RedirectURI, + ClientSecretExpiresAt: creds.ClientSecretExpiresAt, + CreatedAt: creds.CreatedAt, + } } diff --git a/pkg/authserver/runner/dcr_store_test.go b/pkg/authserver/runner/dcr_store_test.go index 40668879ed..4f61fa79fa 100644 --- a/pkg/authserver/runner/dcr_store_test.go +++ b/pkg/authserver/runner/dcr_store_test.go @@ -17,10 +17,10 @@ import ( "github.com/stacklok/toolhive/pkg/authserver/storage" ) -func TestInMemoryDCRResolutionCache_PutGet_RoundTrip(t *testing.T) { +func TestStorageBackedStore_PutGet_RoundTrip(t *testing.T) { t.Parallel() - store := newInMemoryDCRResolutionCache() + store := newMemoryDCRStore(t) ctx := context.Background() key := DCRKey{ @@ -53,10 +53,10 @@ func TestInMemoryDCRResolutionCache_PutGet_RoundTrip(t *testing.T) { assert.Equal(t, resolution.TokenEndpointAuthMethod, got.TokenEndpointAuthMethod) } -func TestInMemoryDCRResolutionCache_Get_MissingKey(t *testing.T) { +func TestStorageBackedStore_Get_MissingKey(t *testing.T) { t.Parallel() - store := newInMemoryDCRResolutionCache() + store := newMemoryDCRStore(t) ctx := context.Background() got, ok, err := store.Get(ctx, DCRKey{Issuer: "https://unknown.example.com"}) @@ -65,10 +65,10 @@ func TestInMemoryDCRResolutionCache_Get_MissingKey(t *testing.T) { assert.Nil(t, got) } -func TestInMemoryDCRResolutionCache_DistinctKeysDoNotCollide(t *testing.T) { +func TestStorageBackedStore_DistinctKeysDoNotCollide(t *testing.T) { t.Parallel() - store := newInMemoryDCRResolutionCache() + store := newMemoryDCRStore(t) ctx := context.Background() keyA := DCRKey{ @@ -92,10 +92,22 @@ func TestInMemoryDCRResolutionCache_DistinctKeysDoNotCollide(t *testing.T) { ScopesHash: storage.ScopesHash([]string{"openid", "email"}), } - require.NoError(t, store.Put(ctx, keyA, &DCRResolution{ClientID: "a"})) - require.NoError(t, store.Put(ctx, keyB, &DCRResolution{ClientID: "b"})) - require.NoError(t, store.Put(ctx, keyC, &DCRResolution{ClientID: "c"})) - require.NoError(t, store.Put(ctx, keyD, &DCRResolution{ClientID: "d"})) + // The persisted *storage.DCRCredentials shape requires non-empty + // AuthorizationEndpoint / TokenEndpoint per validateDCRCredentialsForStore; + // supply a minimal valid resolution so the storage-backed adapter accepts + // the Put, since the test asserts key-distinctness and not field shape. + resolution := func(clientID string) *DCRResolution { + return &DCRResolution{ + ClientID: clientID, + AuthorizationEndpoint: "https://idp.example.com/authorize", + TokenEndpoint: "https://idp.example.com/token", + } + } + + require.NoError(t, store.Put(ctx, keyA, resolution("a"))) + require.NoError(t, store.Put(ctx, keyB, resolution("b"))) + require.NoError(t, store.Put(ctx, keyC, resolution("c"))) + require.NoError(t, store.Put(ctx, keyD, resolution("d"))) for _, tc := range []struct { key DCRKey @@ -113,15 +125,34 @@ func TestInMemoryDCRResolutionCache_DistinctKeysDoNotCollide(t *testing.T) { } } -func TestInMemoryDCRResolutionCache_Put_OverwritesExisting(t *testing.T) { +func TestStorageBackedStore_Put_OverwritesExisting(t *testing.T) { t.Parallel() - store := newInMemoryDCRResolutionCache() + store := newMemoryDCRStore(t) ctx := context.Background() - key := DCRKey{Issuer: "https://idp.example.com", RedirectURI: "https://x.example.com/cb"} - require.NoError(t, store.Put(ctx, key, &DCRResolution{ClientID: "first"})) - require.NoError(t, store.Put(ctx, key, &DCRResolution{ClientID: "second"})) + key := DCRKey{ + Issuer: "https://idp.example.com", + RedirectURI: "https://x.example.com/cb", + ScopesHash: storage.ScopesHash([]string{"openid"}), + } + endpoints := struct { + Authorization string + Token string + }{ + Authorization: "https://idp.example.com/authorize", + Token: "https://idp.example.com/token", + } + require.NoError(t, store.Put(ctx, key, &DCRResolution{ + ClientID: "first", + AuthorizationEndpoint: endpoints.Authorization, + TokenEndpoint: endpoints.Token, + })) + require.NoError(t, store.Put(ctx, key, &DCRResolution{ + ClientID: "second", + AuthorizationEndpoint: endpoints.Authorization, + TokenEndpoint: endpoints.Token, + })) got, ok, err := store.Get(ctx, key) require.NoError(t, err) @@ -129,14 +160,14 @@ func TestInMemoryDCRResolutionCache_Put_OverwritesExisting(t *testing.T) { assert.Equal(t, "second", got.ClientID) } -// TestInMemoryDCRResolutionCache_Put_RejectsNilResolution pins the +// TestStorageBackedStore_Put_RejectsNilResolution pins the // fail-loud-on-invalid-input contract: passing nil must error rather than // silently no-op. A silent no-op would leave the caller with a successful // Put followed by a Get miss and no debug trail to explain it. -func TestInMemoryDCRResolutionCache_Put_RejectsNilResolution(t *testing.T) { +func TestStorageBackedStore_Put_RejectsNilResolution(t *testing.T) { t.Parallel() - store := newInMemoryDCRResolutionCache() + store := newMemoryDCRStore(t) ctx := context.Background() key := DCRKey{Issuer: "https://idp.example.com", RedirectURI: "https://x.example.com/cb"} @@ -150,14 +181,22 @@ func TestInMemoryDCRResolutionCache_Put_RejectsNilResolution(t *testing.T) { assert.False(t, ok, "rejected Put must not leave any entry behind") } -func TestInMemoryDCRResolutionCache_GetReturnsDefensiveCopy(t *testing.T) { +func TestStorageBackedStore_GetReturnsDefensiveCopy(t *testing.T) { t.Parallel() - store := newInMemoryDCRResolutionCache() + store := newMemoryDCRStore(t) ctx := context.Background() - key := DCRKey{Issuer: "https://idp.example.com"} - require.NoError(t, store.Put(ctx, key, &DCRResolution{ClientID: "orig"})) + key := DCRKey{ + Issuer: "https://idp.example.com", + RedirectURI: "https://x.example.com/cb", + ScopesHash: storage.ScopesHash([]string{"openid"}), + } + require.NoError(t, store.Put(ctx, key, &DCRResolution{ + ClientID: "orig", + AuthorizationEndpoint: "https://idp.example.com/authorize", + TokenEndpoint: "https://idp.example.com/token", + })) got, ok, err := store.Get(ctx, key) require.NoError(t, err) @@ -175,19 +214,21 @@ func TestInMemoryDCRResolutionCache_GetReturnsDefensiveCopy(t *testing.T) { // Duplicating the suite here would re-exercise the same code, which is // redundant per .claude/rules/testing.md. -// TestInMemoryDCRResolutionCache_ConcurrentAccess fans out N goroutines +// TestStorageBackedStore_ConcurrentAccess fans out N goroutines // performing alternating Put / Get against overlapping and disjoint keys, -// exercising the sync.RWMutex guard advertised in the dcrResolutionCache -// interface doc. With go test -race this catches any future change that -// drops the lock or introduces a data race in the map access. +// exercising the safe-for-concurrent-use contract advertised on the +// dcrResolutionCache interface. The contract is satisfied by the underlying +// storage.DCRCredentialStore (which holds the lock); this test runs under +// `go test -race` so any future regression that drops the storage backend's +// guarantee, or an adapter change that races on its own state, fails loudly. // // The test is bounded by a fail-fast deadline so a regression that // deadlocks fails loudly with a clear message rather than hanging until // the global Go test timeout. -func TestInMemoryDCRResolutionCache_ConcurrentAccess(t *testing.T) { +func TestStorageBackedStore_ConcurrentAccess(t *testing.T) { t.Parallel() - store := newInMemoryDCRResolutionCache() + store := newMemoryDCRStore(t) const ( workers = 16 @@ -221,8 +262,10 @@ func TestInMemoryDCRResolutionCache_ConcurrentAccess(t *testing.T) { ctx := context.Background() for i := 0; i < opsPerWorker; i++ { resolution := &DCRResolution{ - ClientID: fmt.Sprintf("worker-%d-op-%d", worker, i), - CreatedAt: time.Now(), + ClientID: fmt.Sprintf("worker-%d-op-%d", worker, i), + AuthorizationEndpoint: "https://idp.example.com/authorize", + TokenEndpoint: "https://idp.example.com/token", + CreatedAt: time.Now(), } if i%2 == 0 { if err := store.Put(ctx, overlappingKey(i), resolution); err != nil { @@ -255,3 +298,104 @@ func TestInMemoryDCRResolutionCache_ConcurrentAccess(t *testing.T) { assert.Zero(t, atomic.LoadInt32(&errCount), "no Get/Put should have errored under concurrent access") } + +// TestResolutionCredentialsRoundTrip pins the field-by-field contract +// between resolutionToCredentials and credentialsToResolution: which +// fields survive a round-trip, which are intentionally dropped, and +// which are recovered from the persisted DCRKey. The test exists +// because the two converters are the seam where a field added to either +// DCRResolution or storage.DCRCredentials must be paired with an update +// here; without coverage, a future field addition would silently fail +// to persist across an authserver restart. +// +// The "preserved" group asserts equality on round-tripped values. The +// "dropped" group asserts that the post-round-trip value is the type's +// zero value (ClientIDIssuedAt is informational per RFC 7591 §3.2.1 and +// is not persisted). RedirectURI is in its own group because it is +// dropped from DCRCredentials and recovered via Key.RedirectURI on +// read. +// +// ProviderName is the one DCRCredentials field with no DCRResolution +// counterpart. It is documented in storage.DCRCredentials as "debug / +// audit only — never used as a primary key" and no current consumer +// reads it. The decision to leave it unpopulated by the runner is +// recorded here so a future contributor adding ProviderName threading +// has a single grep target. +func TestResolutionCredentialsRoundTrip(t *testing.T) { + t.Parallel() + + now := time.Now().UTC().Round(time.Second) + expiry := now.Add(30 * 24 * time.Hour) + + key := DCRKey{ + Issuer: "https://idp.example.com", + RedirectURI: "https://thv.example.com/oauth/callback", + ScopesHash: storage.ScopesHash([]string{"openid", "profile"}), + } + + original := &DCRResolution{ + ClientID: "round-trip-client-id", + ClientSecret: "round-trip-secret", + AuthorizationEndpoint: "https://idp.example.com/authorize", + TokenEndpoint: "https://idp.example.com/token", + RegistrationAccessToken: "rfc7592-token", + RegistrationClientURI: "https://idp.example.com/register/round-trip-client-id", + TokenEndpointAuthMethod: "client_secret_basic", + // RedirectURI is recovered from Key on read; pre-populate it on + // the input to confirm it survives via the key, not via a + // dedicated field on DCRCredentials. + RedirectURI: key.RedirectURI, + ClientIDIssuedAt: now, // intentionally dropped on persist + ClientSecretExpiresAt: expiry, + CreatedAt: now, + } + + creds := resolutionToCredentials(key, original) + require.NotNil(t, creds) + + // Persisted-side assertions: which fields the converter writes to + // DCRCredentials, which it leaves zero (the asymmetry F7 documents). + assert.Equal(t, key, creds.Key, + "Key must round-trip via the explicit parameter") + assert.Empty(t, creds.ProviderName, + "ProviderName has no DCRResolution counterpart and is intentionally not populated; "+ + "a future contributor threading it through must update this assertion and the converters together") + + roundTripped := credentialsToResolution(creds) + require.NotNil(t, roundTripped) + + t.Run("preserved fields survive round-trip", func(t *testing.T) { + t.Parallel() + assert.Equal(t, original.ClientID, roundTripped.ClientID) + assert.Equal(t, original.ClientSecret, roundTripped.ClientSecret) + assert.Equal(t, original.AuthorizationEndpoint, roundTripped.AuthorizationEndpoint) + assert.Equal(t, original.TokenEndpoint, roundTripped.TokenEndpoint) + assert.Equal(t, original.RegistrationAccessToken, roundTripped.RegistrationAccessToken) + assert.Equal(t, original.RegistrationClientURI, roundTripped.RegistrationClientURI) + assert.Equal(t, original.TokenEndpointAuthMethod, roundTripped.TokenEndpointAuthMethod) + assert.Equal(t, original.ClientSecretExpiresAt, roundTripped.ClientSecretExpiresAt) + assert.Equal(t, original.CreatedAt, roundTripped.CreatedAt) + }) + + t.Run("RedirectURI is recovered from Key on read", func(t *testing.T) { + t.Parallel() + assert.Equal(t, key.RedirectURI, roundTripped.RedirectURI, + "RedirectURI on the round-tripped resolution must match Key.RedirectURI; "+ + "the converter does not store it twice") + }) + + t.Run("dropped fields zero on read", func(t *testing.T) { + t.Parallel() + // ClientIDIssuedAt is informational (RFC 7591 §3.2.1) and not + // persisted; round-tripping must reset it to the zero value + // rather than silently re-deriving it from CreatedAt. + assert.True(t, roundTripped.ClientIDIssuedAt.IsZero(), + "ClientIDIssuedAt must be zero after round-trip; the field is intentionally dropped from DCRCredentials") + }) + + t.Run("nil inputs short-circuit", func(t *testing.T) { + t.Parallel() + assert.Nil(t, resolutionToCredentials(key, nil)) + assert.Nil(t, credentialsToResolution(nil)) + }) +} diff --git a/pkg/authserver/runner/dcr_test.go b/pkg/authserver/runner/dcr_test.go index 5711a6ffb0..ea5f1ebbdc 100644 --- a/pkg/authserver/runner/dcr_test.go +++ b/pkg/authserver/runner/dcr_test.go @@ -140,7 +140,7 @@ func TestResolveDCRCredentials_CacheHitShortCircuits(t *testing.T) { })) t.Cleanup(server.Close) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) issuer := server.URL // Pre-populate the cache with a resolution matching the key we will @@ -188,7 +188,7 @@ func TestResolveDCRCredentials_RegistersOnCacheMiss(t *testing.T) { }, }) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) issuer := server.URL rc := &authserver.OAuth2UpstreamRunConfig{ Scopes: []string{"openid", "profile"}, @@ -228,7 +228,7 @@ func TestResolveDCRCredentials_ExplicitEndpointsOverride(t *testing.T) { t.Parallel() server := newDCRTestServer(t, dcrTestHandlerConfig{}) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) issuer := server.URL rc := &authserver.OAuth2UpstreamRunConfig{ @@ -263,7 +263,7 @@ func TestResolveDCRCredentials_InitialAccessTokenAsBearer(t *testing.T) { tokenPath := filepath.Join(t.TempDir(), "iat") require.NoError(t, os.WriteFile(tokenPath, []byte("iat-secret-value\n"), 0o600)) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) issuer := server.URL rc := &authserver.OAuth2UpstreamRunConfig{ Scopes: []string{"openid"}, @@ -327,7 +327,7 @@ func TestResolveDCRCredentials_DoesNotForwardBearerOnRedirect(t *testing.T) { tokenPath := filepath.Join(t.TempDir(), "iat") require.NoError(t, os.WriteFile(tokenPath, []byte("iat-secret-value\n"), 0o600)) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) issuer := upstream.URL rc := &authserver.OAuth2UpstreamRunConfig{ Scopes: []string{"openid"}, @@ -396,7 +396,7 @@ func TestResolveDCRCredentials_AuthMethodPreference(t *testing.T) { tokenEndpointAuthMethodsSupported: tc.supported, codeChallengeMethodsSupported: tc.codeChallenge, }) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) issuer := server.URL rc := &authserver.OAuth2UpstreamRunConfig{ Scopes: []string{"openid"}, @@ -435,7 +435,7 @@ func TestResolveDCRCredentials_RefusesNoneWithoutS256(t *testing.T) { tokenEndpointAuthMethodsSupported: []string{"none"}, codeChallengeMethodsSupported: tc.codeChallenge, }) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) issuer := server.URL rc := &authserver.OAuth2UpstreamRunConfig{ Scopes: []string{"openid"}, @@ -461,7 +461,7 @@ func TestResolveDCRCredentials_EmptyAuthMethodIntersectionErrors(t *testing.T) { server := newDCRTestServer(t, dcrTestHandlerConfig{ tokenEndpointAuthMethodsSupported: []string{"tls_client_auth"}, }) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) issuer := server.URL rc := &authserver.OAuth2UpstreamRunConfig{ Scopes: []string{"openid"}, @@ -487,7 +487,7 @@ func TestResolveDCRCredentials_SynthesisedRegistrationEndpoint(t *testing.T) { gotPath = r.URL.Path }, }) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) issuer := server.URL rc := &authserver.OAuth2UpstreamRunConfig{ Scopes: []string{"openid"}, @@ -520,7 +520,7 @@ func TestResolveDCRCredentials_RegistrationEndpointDirectBypassesDiscovery(t *te server := httptest.NewServer(mux) t.Cleanup(server.Close) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) issuer := server.URL rc := &authserver.OAuth2UpstreamRunConfig{ AuthorizationEndpoint: issuer + "/authorize", @@ -561,28 +561,28 @@ func TestResolveDCRCredentials_RejectsInvalidInputs(t *testing.T) { name: "nil run-config", rc: nil, issuer: "https://example.com", - cache: newInMemoryDCRResolutionCache(), + cache: newMemoryDCRStore(t), wantErrSub: "oauth2 upstream run-config is required", }, { name: "pre-provisioned client_id", rc: &authserver.OAuth2UpstreamRunConfig{ClientID: "preprovisioned", DCRConfig: validCfg}, issuer: "https://example.com", - cache: newInMemoryDCRResolutionCache(), + cache: newMemoryDCRStore(t), wantErrSub: "pre-provisioned", }, { name: "missing dcr_config", rc: &authserver.OAuth2UpstreamRunConfig{}, issuer: "https://example.com", - cache: newInMemoryDCRResolutionCache(), + cache: newMemoryDCRStore(t), wantErrSub: "no dcr_config", }, { name: "empty issuer", rc: &authserver.OAuth2UpstreamRunConfig{DCRConfig: validCfg}, issuer: "", - cache: newInMemoryDCRResolutionCache(), + cache: newMemoryDCRStore(t), wantErrSub: "issuer is required", }, { @@ -782,7 +782,7 @@ func TestResolveDCRCredentials_DiscoveryURLHonoured(t *testing.T) { server = httptest.NewServer(mux) t.Cleanup(server.Close) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) issuer := server.URL rc := &authserver.OAuth2UpstreamRunConfig{ Scopes: []string{"openid"}, @@ -823,7 +823,7 @@ func TestResolveDCRCredentials_DiscoveryURLIssuerMismatchRejected(t *testing.T) server := httptest.NewServer(mux) t.Cleanup(server.Close) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) issuer := server.URL rc := &authserver.OAuth2UpstreamRunConfig{ Scopes: []string{"openid"}, @@ -850,7 +850,7 @@ func TestResolveDCRCredentials_DiscoveredScopesFallback(t *testing.T) { gotBody = body }, }) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) issuer := server.URL rc := &authserver.OAuth2UpstreamRunConfig{ // Scopes intentionally left empty so the resolver falls back to @@ -882,7 +882,7 @@ func TestResolveDCRCredentials_EmptyScopesOmitted(t *testing.T) { gotBody = body }, }) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) issuer := server.URL rc := &authserver.OAuth2UpstreamRunConfig{ DCRConfig: &authserver.DCRUpstreamConfig{ @@ -919,7 +919,7 @@ func TestResolveDCRCredentials_UpstreamIssuerDerivedFromDiscoveryURL(t *testing. server := newDCRTestServer(t, dcrTestHandlerConfig{ tokenEndpointAuthMethodsSupported: []string{"client_secret_basic"}, }) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) // Caller-supplied issuer names this auth server, NOT the upstream. // Production wiring always passes its own issuer here (see @@ -1061,7 +1061,7 @@ func TestResolveDCRCredentials_SingleflightCoalescesConcurrentCallers(t *testing }, }) - cache := &countingStore{inner: newInMemoryDCRResolutionCache()} + cache := &countingStore{inner: newMemoryDCRStore(t)} issuer := server.URL rc := &authserver.OAuth2UpstreamRunConfig{ Scopes: []string{"openid", "profile"}, @@ -1472,7 +1472,7 @@ func TestResolveDCRCredentials_RefetchesOnExpiredCachedSecret(t *testing.T) { }, }) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) issuer := server.URL rc := &authserver.OAuth2UpstreamRunConfig{ Scopes: []string{"openid"}, @@ -1527,7 +1527,7 @@ func TestResolveDCRCredentials_HonoursFutureExpiryAndZero(t *testing.T) { atomic.AddInt32(®istrationCalls, 1) }, }) - cache := newInMemoryDCRResolutionCache() + cache := newMemoryDCRStore(t) issuer := server.URL rc := &authserver.OAuth2UpstreamRunConfig{ Scopes: []string{"openid"}, @@ -1678,7 +1678,7 @@ func TestDcrStepError(t *testing.T) { // Precondition failure → dcrStepValidate. _, err := resolveDCRCredentials(context.Background(), nil, "https://as", - newInMemoryDCRResolutionCache()) + newMemoryDCRStore(t)) require.Error(t, err) var stepErr *dcrStepError require.True(t, errors.As(err, &stepErr)) diff --git a/pkg/authserver/runner/dcr_testhelpers_test.go b/pkg/authserver/runner/dcr_testhelpers_test.go new file mode 100644 index 0000000000..b0abdfd8ad --- /dev/null +++ b/pkg/authserver/runner/dcr_testhelpers_test.go @@ -0,0 +1,28 @@ +// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package runner + +import ( + "testing" + + "github.com/stacklok/toolhive/pkg/authserver/storage" +) + +// newMemoryDCRStore is a test-only convenience constructor wrapping +// storage.NewMemoryStorage in the runner-side adapter. Production deployments +// do NOT reach this constructor — NewEmbeddedAuthServer type-asserts the +// shared authserver storage to storage.DCRCredentialStore and passes it to +// newStorageBackedStore directly. +// +// The caller's *testing.T is required because storage.NewMemoryStorage +// launches a background cleanup goroutine on construction; the helper +// registers t.Cleanup(stor.Close) so each test releases the goroutine when +// it finishes. Without this every test that built a fresh store would leak a +// cleanupLoop goroutine for the duration of the test process. +func newMemoryDCRStore(t *testing.T) dcrResolutionCache { + t.Helper() + stor := storage.NewMemoryStorage() + t.Cleanup(func() { _ = stor.Close() }) + return newStorageBackedStore(stor) +} diff --git a/pkg/authserver/runner/embeddedauthserver.go b/pkg/authserver/runner/embeddedauthserver.go index 3e451b30aa..9d67d06a43 100644 --- a/pkg/authserver/runner/embeddedauthserver.go +++ b/pkg/authserver/runner/embeddedauthserver.go @@ -36,30 +36,16 @@ const ( // EmbeddedAuthServer wraps the authorization server for integration with the proxy runner. // It handles configuration transformation from authserver.RunConfig to authserver.Config, // manages resource lifecycle, and provides HTTP handlers for OAuth/OIDC endpoints. +// +// The DCR credential store is owned by the underlying authserver.Server and +// reached via DCRStore(); see that accessor's doc for SECURITY and lifecycle +// notes. Storing it twice on this struct would create a drift window with +// the server's copy, so we delegate through e.server.DCRStore() instead. type EmbeddedAuthServer struct { server authserver.Server keyProvider keys.KeyProvider - // dcrStore caches RFC 7591 Dynamic Client Registration resolutions across - // calls to buildUpstreamConfigs so that re-entrant boot/reload paths reuse - // previously-registered upstream clients instead of re-registering. - // - // Lifetime: per-instance — the store is owned by this EmbeddedAuthServer - // and is GC'd when the server is unreferenced. This intentionally - // contrasts with the package-level dcrFlight singleflight in dcr.go, - // which is process-wide so concurrent EmbeddedAuthServer instances - // targeting the same upstream still deduplicate the network call. The - // cache (per-instance) and the flight (process-wide) protect different - // resources: the cache prevents redundant registrations across reboots, - // the flight prevents N concurrent /register calls during a thundering - // herd. The asymmetry is by design. - // - // dcrResolutionCache has no Close method today because the in-memory - // implementation needs no release. A future Phase 3 backend (Redis, - // sqlite) with handles will need Close added to the interface and - // invoked from EmbeddedAuthServer.Close. - dcrStore dcrResolutionCache - closeOnce sync.Once - closeErr error + closeOnce sync.Once + closeErr error } // NewEmbeddedAuthServer creates an EmbeddedAuthServer from authserver.RunConfig. @@ -74,6 +60,62 @@ func NewEmbeddedAuthServer(ctx context.Context, cfg *authserver.RunConfig) (*Emb return nil, fmt.Errorf("config is required") } + // Create the storage backend FIRST so the DCR resolver and the auth + // server share the same persistence. Both MemoryStorage and RedisStorage + // satisfy storage.DCRCredentialStore (verified by package-level var _ + // checks in pkg/authserver/storage), so an explicit type assertion at + // the boundary is provably safe and keeps the wider Storage interface + // from advertising secret-bearing DCR methods to every consumer. This + // is the wiring change that lets a Redis-backed authserver reuse RFC + // 7591 client registrations across replicas and restarts. + stor, err := createStorage(ctx, cfg.Storage) + if err != nil { + return nil, fmt.Errorf("failed to create storage: %w", err) + } + return newEmbeddedAuthServerWithStorage(ctx, cfg, stor) +} + +// newEmbeddedAuthServerWithStorage is the unexported core constructor that +// builds an EmbeddedAuthServer around a caller-supplied storage backend. +// NewEmbeddedAuthServer dispatches into this helper after running +// createStorage; tests dispatch into it directly so they can supply a +// closeTrackingStorage wrapper to verify the deferred-cleanup contract. +// +// Resource ownership: on success, the returned EmbeddedAuthServer takes +// ownership of stor (its Close releases the backend). On any error path +// after entry, the deferred cleanup closes stor before returning so a +// crash-looping caller (typical when DCR's network I/O fails) does not +// leak the Redis client connection pool / MemoryStorage cleanup goroutine +// on every restart. The named return retErr is the gate. +func newEmbeddedAuthServerWithStorage( + ctx context.Context, + cfg *authserver.RunConfig, + stor storage.Storage, +) (retEAS *EmbeddedAuthServer, retErr error) { + // From here on, any error must close stor before returning. + // + // Both errors are passed through sanitizeErrorForLog before being + // recorded: closeErr for symmetry with retErr, retErr because the + // most common cause of reaching this gate is a wrapped DCR failure + // whose error chain may inline several KiB of the upstream's raw + // /register response body — that body is attacker-influenced and may + // contain URL components that carry credentials (userinfo, query, + // fragment). The existing logDCRStepError boundary log routes + // through the same sanitiser; keep the two log paths consistent so + // the cleanup log cannot regress to a less-defended state. The + // "cause" key matches the package-wide vocabulary for the + // triggering error. + defer func() { + if retErr != nil { + if closeErr := stor.Close(); closeErr != nil { + slog.Warn("failed to close storage on NewEmbeddedAuthServer error path", + "error", sanitizeErrorForLog(closeErr), + "cause", sanitizeErrorForLog(retErr), + ) + } + } + }() + // 1. Create key provider from RunConfig.SigningKeyConfig keyProvider, err := createKeyProvider(cfg.SigningKeyConfig) if err != nil { @@ -92,15 +134,25 @@ func NewEmbeddedAuthServer(ctx context.Context, cfg *authserver.RunConfig) (*Emb return nil, fmt.Errorf("failed to parse token lifespans: %w", err) } - // 4. Build upstream configurations (resolves DCR credentials for any - // upstream configured with DCRConfig, caching resolutions in dcrStore). - dcrStore := newInMemoryDCRResolutionCache() - upstreams, err := buildUpstreamConfigs(ctx, cfg.Upstreams, cfg.Issuer, dcrStore) + // 4. Type-assert to the DCR-capable handle for the resolver. The + // per-backend `var _ DCRCredentialStore = (*MemoryStorage)(nil)` / + // `(*RedisStorage)(nil)` checks make this provably safe for production + // backends; surfacing a non-DCR backend as a constructor error keeps + // misconfiguration fail-loud at boot rather than at first DCR resolve. + dcrStore, ok := stor.(storage.DCRCredentialStore) + if !ok { + return nil, fmt.Errorf("storage backend %T does not implement storage.DCRCredentialStore", stor) + } + + // 5. Build upstream configurations. The DCR resolver caches RFC 7591 + // resolutions in dcrStore so re-entrant boot/reload paths reuse + // previously-registered upstream clients instead of re-registering. + upstreams, err := buildUpstreamConfigs(ctx, cfg.Upstreams, cfg.Issuer, newStorageBackedStore(dcrStore)) if err != nil { return nil, fmt.Errorf("failed to build upstream configs: %w", err) } - // 5. Build the resolved Config + // 6. Build the resolved Config resolvedCfg := authserver.Config{ Issuer: cfg.Issuer, AuthorizationEndpointBaseURL: cfg.AuthorizationEndpointBaseURL, @@ -114,13 +166,9 @@ func NewEmbeddedAuthServer(ctx context.Context, cfg *authserver.RunConfig) (*Emb AllowedAudiences: cfg.AllowedAudiences, } - // 6. Create storage backend based on configuration - stor, err := createStorage(ctx, cfg.Storage) - if err != nil { - return nil, fmt.Errorf("failed to create storage: %w", err) - } - - // 7. Create the auth server + // 7. Create the auth server. authserver.New also asserts the DCR + // capability internally so its DCRStore() accessor returns the same + // asserted handle this constructor used for buildUpstreamConfigs. server, err := authserver.New(ctx, resolvedCfg, stor) if err != nil { return nil, fmt.Errorf("failed to create auth server: %w", err) @@ -129,7 +177,6 @@ func NewEmbeddedAuthServer(ctx context.Context, cfg *authserver.RunConfig) (*Emb return &EmbeddedAuthServer{ server: server, keyProvider: keyProvider, - dcrStore: dcrStore, }, nil } @@ -173,6 +220,16 @@ func (e *EmbeddedAuthServer) KeyProvider() keys.KeyProvider { return e.keyProvider } +// DCRStore returns the persistent DCR credential store the authorization +// server is wired against. This delegates to the underlying authserver.Server +// so this struct does not hold a redundant copy that could drift if the +// server ever swaps backends. See authserver.Server.DCRStore for SECURITY +// and lifecycle notes — the returned interface surfaces raw client_secret +// and registration_access_token values and MUST NOT be logged or rendered. +func (e *EmbeddedAuthServer) DCRStore() storage.DCRCredentialStore { + return e.server.DCRStore() +} + // Routes returns the authorization server's HTTP route map. // // The /.well-known/ paths are registered explicitly because that namespace is shared: diff --git a/pkg/authserver/runner/embeddedauthserver_test.go b/pkg/authserver/runner/embeddedauthserver_test.go index 55c001cef2..925200c8f0 100644 --- a/pkg/authserver/runner/embeddedauthserver_test.go +++ b/pkg/authserver/runner/embeddedauthserver_test.go @@ -4,6 +4,7 @@ package runner import ( + "bytes" "context" "crypto/ecdsa" "crypto/elliptic" @@ -13,6 +14,7 @@ import ( "encoding/pem" "fmt" "io" + "log/slog" "net/http" "net/http/httptest" "os" @@ -1356,6 +1358,7 @@ type stubServer struct { func (s *stubServer) Handler() http.Handler { return s.handler } func (*stubServer) IDPTokenStorage() storage.UpstreamTokenStorage { return nil } func (*stubServer) UpstreamTokenRefresher() storage.UpstreamTokenRefresher { return nil } +func (*stubServer) DCRStore() storage.DCRCredentialStore { return nil } func (*stubServer) Close() error { return nil } func TestRoutes(t *testing.T) { @@ -1567,7 +1570,7 @@ func TestBuildUpstreamConfigs_DCR(t *testing.T) { AllowedAudiences: []string{"https://mcp.example.com"}, } - store := newInMemoryDCRResolutionCache() + store := newMemoryDCRStore(t) got, err := buildUpstreamConfigs(context.Background(), cfg.Upstreams, cfg.Issuer, store) require.NoError(t, err) require.Len(t, got, 1) @@ -1629,7 +1632,7 @@ func TestBuildUpstreamConfigs_DCR(t *testing.T) { AllowedAudiences: []string{"https://mcp.example.com"}, } - store := newInMemoryDCRResolutionCache() + store := newMemoryDCRStore(t) // First call: populates the store. _, err := buildUpstreamConfigs(context.Background(), cfg.Upstreams, cfg.Issuer, store) @@ -1704,24 +1707,28 @@ func TestNewEmbeddedAuthServer_DCRBoot(t *testing.T) { require.NotNil(t, embed) t.Cleanup(func() { _ = embed.Close() }) - // The constructor must have populated a non-nil dcrStore. - require.NotNil(t, embed.dcrStore, "NewEmbeddedAuthServer must initialise a dcrStore") + // The constructor must have wired a non-nil DCR store. + dcrStore := embed.DCRStore() + require.NotNil(t, dcrStore, "NewEmbeddedAuthServer must wire a DCR store") // The DCR registration must have hit the mock AS at least once. assert.Greater(t, atomic.LoadInt32(requestCount), int32(0), "DCR boot should have issued network I/O to the mock AS") // The store on the EmbeddedAuthServer contains the canonical DCRKey - // for this upstream — no separate in-memory store was created. + // for this upstream — the accessor delegates to the same + // storage.DCRCredentialStore createStorage produced, so a successful + // boot persisted the resolution there directly (no separate in-memory + // store was created). redirectURI := server.URL + "/oauth/callback" key := DCRKey{ Issuer: server.URL, RedirectURI: redirectURI, ScopesHash: storage.ScopesHash([]string{"openid", "profile"}), } - cached, ok, err := embed.dcrStore.Get(context.Background(), key) - require.NoError(t, err) - require.True(t, ok, "dcrStore on EmbeddedAuthServer must hold the DCR resolution") + cached, err := dcrStore.GetDCRCredentials(context.Background(), key) + require.NoError(t, err, "DCR store on EmbeddedAuthServer must hold the DCR resolution") + require.NotNil(t, cached) assert.Equal(t, "dcr-client-id", cached.ClientID) assert.Equal(t, "dcr-client-secret", cached.ClientSecret) @@ -1733,3 +1740,301 @@ func TestNewEmbeddedAuthServer_DCRBoot(t *testing.T) { assert.Same(t, originalOAuth2, cfg.Upstreams[0].OAuth2Config, "NewEmbeddedAuthServer must not replace the caller's OAuth2Config pointer") } + +// closeTrackingStorage wraps an authserver storage and counts Close calls. +// It implements storage.Storage by embedding the wrapped value, then +// overriding Close to record the call before delegating to the inner +// Storage. Used by TestNewEmbeddedAuthServer_ClosesStorageOnError to +// verify the deferred-cleanup contract on NewEmbeddedAuthServer's error +// paths without depending on goroutine-count heuristics (which are +// confounded by HTTP transport keep-alive goroutines this package does +// not own). +type closeTrackingStorage struct { + storage.Storage + closeCount atomic.Int32 +} + +func (s *closeTrackingStorage) Close() error { + s.closeCount.Add(1) + return s.Storage.Close() +} + +// TestNewEmbeddedAuthServer_ClosesStorageOnError pins the post-#5185 +// invariant that NewEmbeddedAuthServer never leaks the storage backend on +// the constructor's error paths. +// +// Before the wiring change, createStorage ran late in the constructor so +// most error paths (DCR resolver, upstream config build) returned without +// having opened the backend. After the change, createStorage runs first — +// so a DCR failure (the most likely failure mode in production: upstream +// AS unreachable, /register 4xx) returns from the constructor with the +// storage still holding OS-level resources (Redis client connection pool, +// MemoryStorage cleanup goroutine). Without the deferred cleanup, a +// crash-looping pod would leak one connection pool / goroutine per +// restart. +// +// The test calls newEmbeddedAuthServerWithStorage (the test seam that +// production NewEmbeddedAuthServer dispatches into) so the storage +// instance is observable: a closeTrackingStorage wrapper records every +// Close call. The assertion is then a direct count rather than a +// goroutine-count heuristic. This avoids the package-level swap-pattern +// that would otherwise force the test to run non-parallel. +func TestNewEmbeddedAuthServer_ClosesStorageOnError(t *testing.T) { + t.Parallel() + + tracker := &closeTrackingStorage{Storage: storage.NewMemoryStorage()} + + // Always-500 discovery endpoint forces the DCR resolver to fail + // during buildUpstreamConfigs — i.e. inside the leak window between + // createStorage success and EmbeddedAuthServer construction. + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + t.Cleanup(server.Close) + + cfg := &authserver.RunConfig{ + SchemaVersion: authserver.CurrentSchemaVersion, + Issuer: server.URL, + Upstreams: []authserver.UpstreamRunConfig{ + { + Name: "dcr-upstream", + Type: authserver.UpstreamProviderTypeOAuth2, + OAuth2Config: &authserver.OAuth2UpstreamRunConfig{ + ClientID: "", + AuthorizationEndpoint: server.URL + "/authorize", + TokenEndpoint: server.URL + "/token", + Scopes: []string{"openid", "profile"}, + DCRConfig: &authserver.DCRUpstreamConfig{ + DiscoveryURL: server.URL + "/.well-known/oauth-authorization-server", + }, + }, + }, + }, + AllowedAudiences: []string{"https://mcp.example.com"}, + } + + embed, err := newEmbeddedAuthServerWithStorage(context.Background(), cfg, tracker) + require.Error(t, err, + "discovery returns 500, so DCR resolution must fail and the constructor must return an error") + assert.Nil(t, embed, + "failed constructor must return nil EmbeddedAuthServer") + assert.Equal(t, int32(1), tracker.closeCount.Load(), + "failed NewEmbeddedAuthServer must Close the storage exactly once via the deferred-cleanup gate; "+ + "a count of 0 indicates the deferred Close did not run, leaking the backend on the error path") +} + +// TestEmbeddedAuthServer_DCRStorePersistsAcrossClose verifies that the DCR +// store reachable through EmbeddedAuthServer.DCRStore() holds the resolved +// RFC 7591 client registration after the constructor's full DCR resolver +// runs against a mock AS. The Get is issued BEFORE Close so the assertion +// does not depend on the (undocumented) MemoryStorage post-Close +// readability that an earlier version of this test silently relied on. +// +// What this test does cover: +// +// - NewEmbeddedAuthServer runs the full DCR resolver against a mock AS +// during construction, populating the storage-backed DCR store, and +// surfaces the same storage.DCRCredentialStore the authserver itself +// reads from via DCRStore(). The persisted credentials are readable +// by issuing a Get against the captured store while the server is +// still live. +// +// What this test does NOT cover (deferred follow-up): +// +// - The full "boot, close, boot again on the same backend, observe zero +// /register calls on the second boot" cross-restart scenario. Closing +// that gap requires either miniredis-Sentinel emulation or a +// Docker-based Redis Sentinel cluster in the test harness, since the +// production restart path lives on Redis (Memory cannot be shared +// across two NewEmbeddedAuthServer constructors). Tracked as a +// follow-up; this test deliberately scopes itself to what is +// exercisable today against the production constructor seam. +func TestEmbeddedAuthServer_DCRStorePersistsAcrossClose(t *testing.T) { + t.Parallel() + + server, requestCount := newMockAuthorizationServer(t) + + cfg := &authserver.RunConfig{ + SchemaVersion: authserver.CurrentSchemaVersion, + Issuer: server.URL, + Upstreams: []authserver.UpstreamRunConfig{ + { + Name: "dcr-upstream", + Type: authserver.UpstreamProviderTypeOAuth2, + OAuth2Config: &authserver.OAuth2UpstreamRunConfig{ + ClientID: "", + AuthorizationEndpoint: server.URL + "/authorize", + TokenEndpoint: server.URL + "/token", + Scopes: []string{"openid", "profile"}, + DCRConfig: &authserver.DCRUpstreamConfig{ + DiscoveryURL: server.URL + "/.well-known/oauth-authorization-server", + }, + }, + }, + }, + AllowedAudiences: []string{"https://mcp.example.com"}, + } + + embed, err := NewEmbeddedAuthServer(context.Background(), cfg) + require.NoError(t, err) + require.NotNil(t, embed) + t.Cleanup(func() { _ = embed.Close() }) + + firstBootRequests := atomic.LoadInt32(requestCount) + require.Greater(t, firstBootRequests, int32(0), + "first boot must have issued network I/O to the mock AS during DCR") + + // Capture the storage instance the constructor wired into the DCR + // store. This is the same backend the authserver itself was using; in + // production it is shared across authserver state, so DCR survives + // restart on the same backend. + persistentStore := embed.DCRStore() + require.NotNil(t, persistentStore, + "NewEmbeddedAuthServer must surface a storage-level DCRCredentialStore") + + // Verify the persisted DCR row by issuing a Get against the captured + // store BEFORE closing the server. Doing the Get pre-Close avoids + // silently depending on whichever storage backend the test happens to + // use staying readable after Close (a contract MemoryStorage honors + // today but RedisStorage's closed connection pool does not). The + // assertion proves the persistence boundary the production cross- + // replica and cross-restart reuse paths depend on: that the + // resolution lives in storage, not in process-local cache state. + redirectURI := server.URL + "/oauth/callback" + key := DCRKey{ + Issuer: server.URL, + RedirectURI: redirectURI, + ScopesHash: storage.ScopesHash([]string{"openid", "profile"}), + } + creds, err := persistentStore.GetDCRCredentials(context.Background(), key) + require.NoError(t, err, + "DCR credentials must be readable from the captured store — "+ + "this is the persistence boundary cross-replica reuse depends on") + require.NotNil(t, creds) + assert.Equal(t, "dcr-client-id", creds.ClientID, + "persisted ClientID must match the first boot's DCR resolution") + assert.Equal(t, "dcr-client-secret", creds.ClientSecret, + "persisted ClientSecret must match the first boot's DCR resolution") + + // Mock-AS request count is unchanged after the survival check — the + // Get is a pure store read with no upstream traffic. + assert.Equal(t, firstBootRequests, atomic.LoadInt32(requestCount), + "GetDCRCredentials must not issue any HTTP requests to the mock AS") +} + +// urlErrorOnCloseStorage wraps an authserver storage and returns a fixed +// URL-bearing error from Close. It exists so +// TestNewEmbeddedAuthServer_DeferredCleanupSanitizesLog can verify that the +// deferred-cleanup gate routes both closeErr and retErr through +// sanitizeErrorForLog, scrubbing any query / userinfo / fragment that might +// carry credentials in a future regression. +type urlErrorOnCloseStorage struct { + storage.Storage + closeErr error +} + +func (s *urlErrorOnCloseStorage) Close() error { + // Intentionally drop the inner Close result: this test is about the log + // path, not about double-closing the inner storage. Returning the + // fixed error makes the slog capture deterministic. + _ = s.Storage.Close() + return s.closeErr +} + +// TestNewEmbeddedAuthServer_DeferredCleanupSanitizesLog pins the post-#5196 +// invariant that the deferred-cleanup slog.Warn at the top of +// newEmbeddedAuthServerWithStorage routes both closeErr and retErr through +// sanitizeErrorForLog, so a future regression that drops the call (or that +// changes the error chain to inline an upstream response body containing a +// userinfo/query/fragment) cannot silently leak secrets to operator logs. +// +// The test injects a closeErr containing a URL with a secret-bearing query +// (?token=leak-marker) and a discovery URL whose host appears verbatim in +// the wrapped DCR error chain (so the captured slog record's `cause` field +// also exercises the sanitiser). It then asserts: +// - the captured log record does NOT contain the literal secret marker; +// - the captured log record DOES contain the host components, so +// operators retain enough context to correlate the failure. +// +// NOT t.Parallel(): the test swaps slog.Default() to capture output and +// restores it via t.Cleanup. Running in parallel would race with any other +// test in this package that emits a log record. Confirmed against the +// paralleltest rule on a sample run — every other test failed with a +// data-race report on slog's internal default-logger handle. +// +//nolint:paralleltest // see comment above; mutates the package-global slog.Default() +func TestNewEmbeddedAuthServer_DeferredCleanupSanitizesLog(t *testing.T) { + const ( + closeErrSecretMarker = "close-leak-marker-7f9c" + retErrSecretMarker = "ret-leak-marker-3b2a" + ) + + closeErr := fmt.Errorf( + "redis: connection broken: https://primary:hidden@redis.example.com/0?token=%s", + closeErrSecretMarker, + ) + tracker := &urlErrorOnCloseStorage{ + Storage: storage.NewMemoryStorage(), + closeErr: closeErr, + } + + // Discovery endpoint returns 500 so the DCR resolver fails and the + // constructor reaches the deferred-cleanup gate. The query string + // embedded in the discovery URL is what the resolver wraps into its + // error chain via fmt.Errorf("...%w", err) — so retErr's text is the + // vehicle for the second secret marker. + httpServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + t.Cleanup(httpServer.Close) + + discoveryURL := httpServer.URL + "/.well-known/oauth-authorization-server?token=" + retErrSecretMarker + + cfg := &authserver.RunConfig{ + SchemaVersion: authserver.CurrentSchemaVersion, + Issuer: httpServer.URL, + Upstreams: []authserver.UpstreamRunConfig{ + { + Name: "dcr-upstream", + Type: authserver.UpstreamProviderTypeOAuth2, + OAuth2Config: &authserver.OAuth2UpstreamRunConfig{ + ClientID: "", + AuthorizationEndpoint: httpServer.URL + "/authorize", + TokenEndpoint: httpServer.URL + "/token", + Scopes: []string{"openid", "profile"}, + DCRConfig: &authserver.DCRUpstreamConfig{ + DiscoveryURL: discoveryURL, + }, + }, + }, + }, + AllowedAudiences: []string{"https://mcp.example.com"}, + } + + // Capture slog output by swapping the default logger for the duration + // of this test. Restore on cleanup so parallel tests are unaffected. + var buf bytes.Buffer + prev := slog.Default() + slog.SetDefault(slog.New(slog.NewTextHandler(&buf, &slog.HandlerOptions{Level: slog.LevelDebug}))) + t.Cleanup(func() { slog.SetDefault(prev) }) + + embed, err := newEmbeddedAuthServerWithStorage(context.Background(), cfg, tracker) + require.Error(t, err) + assert.Nil(t, embed) + + logged := buf.String() + + // Defense in depth: both secret markers must be stripped before the + // Warn record reaches operator logs. + assert.NotContains(t, logged, closeErrSecretMarker, + "closeErr secret marker must be sanitised before reaching the Warn record") + assert.NotContains(t, logged, retErrSecretMarker, + "retErr secret marker must be sanitised before reaching the Warn record") + assert.NotContains(t, logged, "primary:hidden", + "closeErr userinfo must be sanitised before reaching the Warn record") + + // The host components survive sanitisation so operators retain enough + // context to correlate the failure with upstream logs. + assert.Contains(t, logged, "redis.example.com", + "closeErr host must remain in the Warn record after sanitisation") +} diff --git a/pkg/authserver/server.go b/pkg/authserver/server.go index aef969fec8..d06e2331f5 100644 --- a/pkg/authserver/server.go +++ b/pkg/authserver/server.go @@ -36,6 +36,23 @@ type Server interface { // Returns nil if no upstream IDP is configured. UpstreamTokenRefresher() storage.UpstreamTokenRefresher + // DCRStore returns the persistent DCR credential store the server is wired + // against. This is the same DCRCredentialStore used by the upstream-DCR + // resolver at boot, so callers can read RFC 7591 client registrations + // without bypassing the storage backend the server itself reads from. + // + // SECURITY: the returned interface surfaces raw `client_secret` and + // `registration_access_token` values. Callers MUST NOT log or render the + // returned values; treat the handle the same way you would treat a + // secrets manager client. Intended for admin / diagnostic code paths and + // integration tests, not for general consumers. + // + // Lifecycle: the returned handle's lifetime is bound to Server.Close — + // methods invoked after Close have backend-specific behavior (a + // MemoryStorage continues to serve reads; a RedisStorage will error on + // its closed connection pool). + DCRStore() storage.DCRCredentialStore + // Close releases resources held by the server. Close() error } diff --git a/pkg/authserver/server_impl.go b/pkg/authserver/server_impl.go index 7faaeba066..55fcde584a 100644 --- a/pkg/authserver/server_impl.go +++ b/pkg/authserver/server_impl.go @@ -22,8 +22,14 @@ import ( // server is the internal implementation of the Server interface. type server struct { - handler http.Handler - storage storage.Storage + handler http.Handler + storage storage.Storage + // dcrStore is the same storage.Storage value asserted to + // storage.DCRCredentialStore. The assertion runs once at construction + // (newServer) so DCRStore() is a field read rather than re-asserting on + // every call, and a backend that does not implement DCRCredentialStore + // is rejected at boot rather than at first DCR resolve. + dcrStore storage.DCRCredentialStore upstreams []handlers.NamedUpstream // refreshTokenLifespan mirrors the validated Config.RefreshTokenLifespan. // It is threaded into upstreamTokenRefresher so the refresh path can @@ -93,6 +99,19 @@ func newServer(ctx context.Context, cfg Config, stor storage.Storage, opts ...se return nil, fmt.Errorf("storage is required") } + // Storage no longer embeds DCRCredentialStore (the embed widened secret + // reach to every Storage consumer); obtain the DCR-capable handle via an + // explicit assertion at the boundary. The per-backend + // `var _ DCRCredentialStore = (*MemoryStorage)(nil)` / + // `var _ DCRCredentialStore = (*RedisStorage)(nil)` checks make this + // provably safe for the production backends; surfacing a bad backend as + // a constructor error keeps misconfiguration fail-loud at boot rather + // than at first DCR resolve. + dcrStore, ok := stor.(storage.DCRCredentialStore) + if !ok { + return nil, fmt.Errorf("storage backend %T does not implement storage.DCRCredentialStore", stor) + } + slog.Debug("creating OAuth2 configuration") // Get signing key from KeyProvider @@ -172,6 +191,7 @@ func newServer(ctx context.Context, cfg Config, stor storage.Storage, opts ...se return &server{ handler: router, storage: stor, + dcrStore: dcrStore, upstreams: upstreams, refreshTokenLifespan: cfg.RefreshTokenLifespan, }, nil @@ -187,6 +207,12 @@ func (s *server) IDPTokenStorage() storage.UpstreamTokenStorage { return s.storage } +// DCRStore returns the persistent DCR credential store the server is wired +// against. See the Server interface doc for SECURITY and lifecycle notes. +func (s *server) DCRStore() storage.DCRCredentialStore { + return s.dcrStore +} + // UpstreamTokenRefresher returns a refresher that wraps the upstream providers // and storage to transparently refresh expired upstream tokens. The refresher // dispatches to the correct provider based on each token's ProviderID. diff --git a/pkg/authserver/server_test.go b/pkg/authserver/server_test.go index 121082f54f..d530239950 100644 --- a/pkg/authserver/server_test.go +++ b/pkg/authserver/server_test.go @@ -13,6 +13,7 @@ import ( servercrypto "github.com/stacklok/toolhive/pkg/authserver/server/crypto" "github.com/stacklok/toolhive/pkg/authserver/server/keys" + "github.com/stacklok/toolhive/pkg/authserver/storage" storagemocks "github.com/stacklok/toolhive/pkg/authserver/storage/mocks" "github.com/stacklok/toolhive/pkg/authserver/upstream" upstreammocks "github.com/stacklok/toolhive/pkg/authserver/upstream/mocks" @@ -145,10 +146,17 @@ func TestNewServer_Success(t *testing.T) { ctrl := gomock.NewController(t) defer ctrl.Finish() - // Create mocks - mockStorage := storagemocks.NewMockStorage(ctrl) mockUpstream := upstreammocks.NewMockOAuth2Provider(ctrl) + // Use a real MemoryStorage rather than storagemocks.MockStorage: the + // constructor type-asserts the storage to storage.DCRCredentialStore (per + // the F6 design — Storage no longer embeds DCRCredentialStore), and + // generated MockStorage does not implement DCRCredentialStore. This test + // exercises the constructor flow, not specific storage method calls, so + // a real MemoryStorage is sufficient and keeps the assertion path real. + stor := storage.NewMemoryStorage() + t.Cleanup(func() { _ = stor.Close() }) + // Create valid config cfg := Config{ Issuer: "https://example.com", @@ -165,7 +173,7 @@ func TestNewServer_Success(t *testing.T) { // Call newServer with the mock factory ctx := context.Background() - srv, err := newServer(ctx, cfg, mockStorage, withUpstreamFactory(mockFactory)) + srv, err := newServer(ctx, cfg, stor, withUpstreamFactory(mockFactory)) if err != nil { t.Fatalf("newServer() unexpected error: %v", err) @@ -176,7 +184,7 @@ func TestNewServer_Success(t *testing.T) { if srv.Handler() == nil { t.Error("server.Handler() returned nil") } - if srv.IDPTokenStorage() != mockStorage { + if srv.IDPTokenStorage() != stor { t.Error("server.IDPTokenStorage() did not return expected storage") } } diff --git a/pkg/authserver/storage/types.go b/pkg/authserver/storage/types.go index 1e70da53d2..80b12b3772 100644 --- a/pkg/authserver/storage/types.go +++ b/pkg/authserver/storage/types.go @@ -229,6 +229,18 @@ func validateDCRCredentialsForStore(creds *DCRCredentials) error { // retains entries for the process lifetime and is intentionally excluded from // the periodic cleanup loop. The Redis backend applies TTL via SET with a // duration when ClientSecretExpiresAt is non-zero. +// +// # Converter contract +// +// MUST update both converters in +// pkg/authserver/runner/dcr_store.go (resolutionToCredentials and +// credentialsToResolution) when adding, renaming, or removing a field +// here. The two converters are the only translation seam between this +// persisted type and the runner-side *DCRResolution; a field added here +// without a paired converter update will silently fail to round-trip +// across an authserver restart. The round-trip behaviour is pinned by +// TestResolutionCredentialsRoundTrip in +// pkg/authserver/runner/dcr_store_test.go. type DCRCredentials struct { // Key is the canonical cache key: (Issuer, RedirectURI, ScopesHash). Key DCRKey @@ -615,6 +627,19 @@ type UserStorage interface { type Storage interface { // Embed segregated interfaces for IDP tokens, pending authorizations, client registry, // and user management for multi-IDP support. + // + // DCRCredentialStore is intentionally NOT embedded here: doing so would + // promote GetDCRCredentials / StoreDCRCredentials onto every consumer of + // storage.Storage (handlers, server, registration, etc.), broadening the + // surface that can read raw client_secret / registration_access_token even + // when those consumers have no DCR responsibility. Code that legitimately + // needs DCR access (the runner and authserver constructors) obtains it + // via an explicit `stor.(DCRCredentialStore)` type assertion at the + // boundary; the per-backend `var _ DCRCredentialStore = (*MemoryStorage)(nil)` + // and `var _ DCRCredentialStore = (*RedisStorage)(nil)` checks in + // memory.go / redis.go provide the compile-time guarantee that production + // backends satisfy the interface, so the runtime assertion is provably + // safe at the boundary while keeping the wider Storage surface narrow. UpstreamTokenStorage PendingAuthorizationStorage ClientRegistry