diff --git a/maas-api/cmd/main.go b/maas-api/cmd/main.go index 443212289..40c6928f1 100644 --- a/maas-api/cmd/main.go +++ b/maas-api/cmd/main.go @@ -15,6 +15,7 @@ import ( "github.com/gin-contrib/cors" "github.com/gin-gonic/gin" + "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/opendatahub-io/models-as-a-service/maas-api/internal/api_keys" "github.com/opendatahub-io/models-as-a-service/maas-api/internal/config" @@ -51,7 +52,7 @@ func serve() error { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - cluster, err := config.NewClusterConfig(cfg.Namespace, cfg.MaaSSubscriptionNamespace, constant.DefaultResyncPeriod) + cluster, err := config.NewClusterConfig(cfg.Namespace, cfg.MaaSSubscriptionNamespace, constant.DefaultResyncPeriod, cfg.SARCacheMaxSize) if err != nil { return fmt.Errorf("failed to create cluster config: %w", err) } @@ -139,6 +140,7 @@ func initStore(ctx context.Context, log *logger.Logger, cfg *config.Config) (api func registerHandlers(ctx context.Context, log *logger.Logger, router *gin.Engine, cfg *config.Config, cluster *config.ClusterConfig, store api_keys.MetadataStore) error { router.GET("/health", handlers.NewHealthHandler().HealthCheck) + router.GET("/metrics", gin.WrapH(promhttp.Handler())) if !cluster.StartAndWaitForSync(ctx.Done()) { return errors.New("failed to sync informer caches") diff --git a/maas-api/go.mod b/maas-api/go.mod index 7d437a170..3af068178 100644 --- a/maas-api/go.mod +++ b/maas-api/go.mod @@ -12,6 +12,8 @@ require ( github.com/kserve/kserve v0.0.0-20251121160314-57d83d202f36 github.com/lib/pq v1.10.9 github.com/openai/openai-go/v2 v2.3.1 + github.com/prometheus/client_golang v1.23.2 + github.com/prometheus/client_model v0.6.2 github.com/stretchr/testify v1.11.1 go.uber.org/zap v1.27.0 golang.org/x/sync v0.19.0 @@ -92,8 +94,6 @@ require ( github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // 
indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - github.com/prometheus/client_golang v1.23.2 // indirect - github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.17.0 // indirect github.com/spf13/pflag v1.0.10 // indirect diff --git a/maas-api/internal/api_keys/handler.go b/maas-api/internal/api_keys/handler.go index 539622ddc..04897f4ac 100644 --- a/maas-api/internal/api_keys/handler.go +++ b/maas-api/internal/api_keys/handler.go @@ -26,7 +26,7 @@ const ( // The SARAdminChecker implementation uses Kubernetes SubjectAccessReview // to check if the user can create maasauthpolicies (RBAC-based admin detection). type AdminChecker interface { - IsAdmin(ctx context.Context, user *token.UserContext) bool + IsAdmin(ctx context.Context, user *token.UserContext) (bool, error) } type Handler struct { @@ -70,22 +70,19 @@ func (h *Handler) getUserContext(c *gin.Context) *token.UserContext { // isAdmin checks if the user has admin privileges via SubjectAccessReview. // Admin is determined by RBAC: can user create maasauthpolicies in the configured MaaS namespace? -// Returns true if the user has admin RBAC permissions, false otherwise. +// Returns whether the user has admin RBAC permissions, and a non-nil error if the check could not be performed. -func (h *Handler) isAdmin(ctx context.Context, user *token.UserContext) bool { +func (h *Handler) isAdmin(ctx context.Context, user *token.UserContext) (bool, error) { if h == nil || user == nil { - return false + return false, nil } return h.adminChecker.IsAdmin(ctx, user) } // isAuthorizedForKey checks if the user is authorized to access the API key. // User is authorized if they own the key or are an admin.
-func (h *Handler) isAuthorizedForKey(ctx context.Context, user *token.UserContext, keyOwner string) bool { - // Check if user owns the key +func (h *Handler) isAuthorizedForKey(ctx context.Context, user *token.UserContext, keyOwner string) (bool, error) { if user.Username == keyOwner { - return true + return true, nil } - - // Check if user is admin return h.isAdmin(ctx, user) } @@ -117,7 +114,13 @@ func (h *Handler) GetAPIKey(c *gin.Context) { } // Check authorization - user must own the key or be admin - if !h.isAuthorizedForKey(c.Request.Context(), user, tok.Username) { + authorized, authErr := h.isAuthorizedForKey(c.Request.Context(), user, tok.Username) + if authErr != nil { + h.logger.Error("Failed to check admin status", "error", authErr) + c.JSON(http.StatusNotFound, gin.H{"error": "API key not found"}) + return + } + if !authorized { h.logger.Warn("Unauthorized API key access attempt", "requestingUser", user.Username, "keyOwner", tok.Username, @@ -292,7 +295,13 @@ func (h *Handler) RevokeAPIKey(c *gin.Context) { } // Check authorization - user must own the key or be admin - if !h.isAuthorizedForKey(c.Request.Context(), user, keyMetadata.Username) { + authorized, authErr := h.isAuthorizedForKey(c.Request.Context(), user, keyMetadata.Username) + if authErr != nil { + h.logger.Error("Failed to check admin status", "error", authErr) + c.JSON(http.StatusNotFound, gin.H{"error": "API key not found"}) + return + } + if !authorized { h.logger.Warn("Unauthorized API key revocation attempt", "requestingUser", user.Username, "keyOwner", keyMetadata.Username, @@ -370,7 +379,12 @@ func (h *Handler) SearchAPIKeys(c *gin.Context) { } // Determine target username for filtering - isAdmin := h.isAdmin(c.Request.Context(), user) + isAdmin, adminErr := h.isAdmin(c.Request.Context(), user) + if adminErr != nil { + h.logger.Error("Failed to check admin status", "error", adminErr) + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to check authorization"}) + return 
+ } targetUsername := req.Filters.Username if !isAdmin { @@ -483,15 +497,23 @@ func (h *Handler) BulkRevokeAPIKeys(c *gin.Context) { } // Authorization: users can revoke own keys, admins can revoke any user's keys - if req.Username != user.Username && !h.isAdmin(c.Request.Context(), user) { - h.logger.Warn("Unauthorized bulk revoke attempt", - "requestingUser", user.Username, - "targetUser", req.Username, - ) - c.JSON(http.StatusForbidden, gin.H{ - "error": "Access denied: you can only bulk revoke your own API keys", - }) - return + if req.Username != user.Username { + isAdmin, adminErr := h.isAdmin(c.Request.Context(), user) + if adminErr != nil { + h.logger.Error("Failed to check admin status", "error", adminErr) + c.JSON(http.StatusInternalServerError, gin.H{"error": "Failed to check authorization"}) + return + } + if !isAdmin { + h.logger.Warn("Unauthorized bulk revoke attempt", + "requestingUser", user.Username, + "targetUser", req.Username, + ) + c.JSON(http.StatusForbidden, gin.H{ + "error": "Access denied: you can only bulk revoke your own API keys", + }) + return + } } // Perform bulk revocation diff --git a/maas-api/internal/api_keys/handler_test.go b/maas-api/internal/api_keys/handler_test.go index 91f741ada..2fd7cf0b9 100644 --- a/maas-api/internal/api_keys/handler_test.go +++ b/maas-api/internal/api_keys/handler_test.go @@ -75,16 +75,16 @@ func newMockAdminChecker() *mockAdminChecker { } } -func (m *mockAdminChecker) IsAdmin(_ context.Context, user *token.UserContext) bool { +func (m *mockAdminChecker) IsAdmin(_ context.Context, user *token.UserContext) (bool, error) { if user == nil { - return false + return false, nil } for _, userGroup := range user.Groups { if slices.Contains(m.adminGroups, userGroup) { - return true + return true, nil } } - return false + return false, nil } // executeSearchRequest is a test helper that executes a search request and returns the parsed response. 
@@ -114,17 +114,23 @@ func TestIsAuthorizedForKey(t *testing.T) { t.Run("OwnerCanAccess", func(t *testing.T) { user := &token.UserContext{Username: "alice", Groups: []string{"users"}} - assert.True(t, h.isAuthorizedForKey(ctx, user, "alice")) + result, err := h.isAuthorizedForKey(ctx, user, "alice") + require.NoError(t, err) + assert.True(t, result) }) t.Run("NonOwnerCannotAccess", func(t *testing.T) { user := &token.UserContext{Username: "bob", Groups: []string{"users"}} - assert.False(t, h.isAuthorizedForKey(ctx, user, "alice")) + result, err := h.isAuthorizedForKey(ctx, user, "alice") + require.NoError(t, err) + assert.False(t, result) }) t.Run("AdminCanAccessAnyKey", func(t *testing.T) { admin := &token.UserContext{Username: "admin", Groups: []string{"admin-users"}} - assert.True(t, h.isAuthorizedForKey(ctx, admin, "alice")) + result, err := h.isAuthorizedForKey(ctx, admin, "alice") + require.NoError(t, err) + assert.True(t, result) }) } diff --git a/maas-api/internal/auth/cached_admin_checker.go b/maas-api/internal/auth/cached_admin_checker.go new file mode 100644 index 000000000..600f88f3d --- /dev/null +++ b/maas-api/internal/auth/cached_admin_checker.go @@ -0,0 +1,162 @@ +package auth + +import ( + "context" + "errors" + "slices" + "strings" + "sync" + "time" + + "github.com/prometheus/client_golang/prometheus" + "k8s.io/utils/clock" + + "github.com/opendatahub-io/models-as-a-service/maas-api/internal/token" +) + +type adminChecker interface { + IsAdmin(ctx context.Context, user *token.UserContext) (bool, error) +} + +type cacheEntry struct { + isAdmin bool + expiresAt time.Time +} + +type CachedAdminChecker struct { + delegate adminChecker + ttl time.Duration + negativeTTL time.Duration + maxSize int + clock clock.Clock + + mu sync.RWMutex + cache map[string]cacheEntry + + hits prometheus.Counter + misses prometheus.Counter +} + +func NewCachedAdminChecker(delegate adminChecker, ttl time.Duration, negativeTTL time.Duration, maxSize int, reg 
prometheus.Registerer, clk clock.Clock) *CachedAdminChecker { + if delegate == nil { + panic("delegate cannot be nil for CachedAdminChecker") + } + if ttl <= 0 { + panic("ttl must be positive for CachedAdminChecker") + } + if negativeTTL <= 0 { + panic("negativeTTL must be positive for CachedAdminChecker") + } + if maxSize <= 0 { + panic("maxSize must be positive for CachedAdminChecker") + } + if reg == nil { + reg = prometheus.DefaultRegisterer + } + if clk == nil { + clk = clock.RealClock{} + } + + return &CachedAdminChecker{ + delegate: delegate, + ttl: ttl, + negativeTTL: negativeTTL, + maxSize: maxSize, + clock: clk, + cache: make(map[string]cacheEntry), + hits: registerOrReuseCounter(reg, prometheus.NewCounter(prometheus.CounterOpts{ + Name: "sar_cache_hits_total", + Help: "Total number of SAR admin check cache hits.", + })), + misses: registerOrReuseCounter(reg, prometheus.NewCounter(prometheus.CounterOpts{ + Name: "sar_cache_misses_total", + Help: "Total number of SAR admin check cache misses.", + })), + } +} + +func (c *CachedAdminChecker) IsAdmin(ctx context.Context, user *token.UserContext) (bool, error) { + if c == nil || user == nil || user.Username == "" { + return false, nil + } + if ctx.Err() != nil { + return false, ctx.Err() + } + + key := cacheKey(user) + now := c.clock.Now() + + c.mu.RLock() + entry, ok := c.cache[key] + c.mu.RUnlock() + + if ok && now.Before(entry.expiresAt) { + c.hits.Inc() + return entry.isAdmin, nil + } + + result, err := c.delegate.IsAdmin(ctx, user) + + if err != nil || ctx.Err() != nil { + c.misses.Inc() + if err != nil { + return false, err + } + return false, ctx.Err() + } + + ttl := c.ttl + if !result { + ttl = c.negativeTTL + } + + c.mu.Lock() + c.evictExpiredLocked(now) + if len(c.cache) < c.maxSize { + c.cache[key] = cacheEntry{ + isAdmin: result, + expiresAt: now.Add(ttl), + } + } + c.mu.Unlock() + + c.misses.Inc() + return result, nil +} + +//nolint:ireturn,nonamedreturns // Prometheus counters are inherently 
interface-typed; named returns clarify which is which. +func (c *CachedAdminChecker) Metrics() (hits, misses prometheus.Counter) { + return c.hits, c.misses +} + +func (c *CachedAdminChecker) evictExpiredLocked(now time.Time) { + for k, v := range c.cache { + if now.After(v.expiresAt) { + delete(c.cache, k) + } + } +} + +//nolint:ireturn // prometheus.Counter is the canonical type for counters. +func registerOrReuseCounter(reg prometheus.Registerer, c prometheus.Counter) prometheus.Counter { + err := reg.Register(c) + if err == nil { + return c + } + var are prometheus.AlreadyRegisteredError + if errors.As(err, &are) { + existing, ok := are.ExistingCollector.(prometheus.Counter) + if !ok { + panic("existing collector is not a Counter") + } + return existing + } + panic(err) +} + +func cacheKey(user *token.UserContext) string { + sorted := make([]string, len(user.Groups)) + copy(sorted, user.Groups) + slices.Sort(sorted) + return user.Username + "\x00" + strings.Join(sorted, "\x00") +} diff --git a/maas-api/internal/auth/cached_admin_checker_test.go b/maas-api/internal/auth/cached_admin_checker_test.go new file mode 100644 index 000000000..a0ae108cb --- /dev/null +++ b/maas-api/internal/auth/cached_admin_checker_test.go @@ -0,0 +1,395 @@ +package auth_test + +import ( + "context" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + testingclock "k8s.io/utils/clock/testing" + + "github.com/opendatahub-io/models-as-a-service/maas-api/internal/auth" + "github.com/opendatahub-io/models-as-a-service/maas-api/internal/token" +) + +type mockDelegate struct { + mu sync.Mutex + calls int + response bool + err error +} + +func (m *mockDelegate) IsAdmin(_ context.Context, _ *token.UserContext) (bool, error) { + m.mu.Lock() + defer m.mu.Unlock() + m.calls++ + return m.response, m.err +} + +func (m 
*mockDelegate) getCalls() int { + m.mu.Lock() + defer m.mu.Unlock() + return m.calls +} + +func counterValue(t *testing.T, c prometheus.Counter) float64 { + t.Helper() + metric, ok := c.(prometheus.Metric) + require.True(t, ok, "counter does not implement prometheus.Metric") + var m dto.Metric + require.NoError(t, metric.Write(&m)) + return m.GetCounter().GetValue() +} + +func newTestChecker(delegate *mockDelegate) *auth.CachedAdminChecker { + reg := prometheus.NewRegistry() + return auth.NewCachedAdminChecker(delegate, time.Minute, 2*time.Second, 8192, reg, nil) +} + +func TestCachedAdminChecker_CacheHit(t *testing.T) { + delegate := &mockDelegate{response: true} + checker := newTestChecker(delegate) + user := &token.UserContext{Username: "alice", Groups: []string{"admins"}} + + result, err := checker.IsAdmin(context.Background(), user) + require.NoError(t, err) + assert.True(t, result) + + result, err = checker.IsAdmin(context.Background(), user) + require.NoError(t, err) + assert.True(t, result) + + assert.Equal(t, 1, delegate.getCalls(), "delegate should be called only once") +} + +func TestCachedAdminChecker_CacheMiss(t *testing.T) { + delegate := &mockDelegate{response: false} + checker := newTestChecker(delegate) + user := &token.UserContext{Username: "bob", Groups: []string{"users"}} + + result, err := checker.IsAdmin(context.Background(), user) + require.NoError(t, err) + assert.False(t, result) + assert.Equal(t, 1, delegate.getCalls()) +} + +func TestCachedAdminChecker_TTLExpiry(t *testing.T) { + delegate := &mockDelegate{response: true} + reg := prometheus.NewRegistry() + fakeClock := testingclock.NewFakeClock(time.Now()) + checker := auth.NewCachedAdminChecker(delegate, 30*time.Second, 2*time.Second, 8192, reg, fakeClock) + + user := &token.UserContext{Username: "alice", Groups: []string{"admins"}} + + _, err := checker.IsAdmin(context.Background(), user) + require.NoError(t, err) + assert.Equal(t, 1, delegate.getCalls()) + + _, err = 
checker.IsAdmin(context.Background(), user) + require.NoError(t, err) + assert.Equal(t, 1, delegate.getCalls(), "should use cache before TTL") + + fakeClock.Step(31 * time.Second) + + _, err = checker.IsAdmin(context.Background(), user) + require.NoError(t, err) + assert.Equal(t, 2, delegate.getCalls(), "should call delegate after TTL expiry") +} + +func TestCachedAdminChecker_DifferentUsers(t *testing.T) { + delegate := &mockDelegate{response: true} + checker := newTestChecker(delegate) + + alice := &token.UserContext{Username: "alice", Groups: []string{"admins"}} + bob := &token.UserContext{Username: "bob", Groups: []string{"admins"}} + + _, _ = checker.IsAdmin(context.Background(), alice) + _, _ = checker.IsAdmin(context.Background(), bob) + + assert.Equal(t, 2, delegate.getCalls(), "different users should be separate cache entries") +} + +func TestCachedAdminChecker_SameUserDifferentGroups(t *testing.T) { + delegate := &mockDelegate{response: true} + checker := newTestChecker(delegate) + + adminAlice := &token.UserContext{Username: "alice", Groups: []string{"admins"}} + userAlice := &token.UserContext{Username: "alice", Groups: []string{"users"}} + + _, _ = checker.IsAdmin(context.Background(), adminAlice) + _, _ = checker.IsAdmin(context.Background(), userAlice) + + assert.Equal(t, 2, delegate.getCalls(), "same user with different groups should be separate cache entries") +} + +func TestCachedAdminChecker_GroupOrderIrrelevant(t *testing.T) { + delegate := &mockDelegate{response: true} + checker := newTestChecker(delegate) + + user1 := &token.UserContext{Username: "alice", Groups: []string{"b", "a", "c"}} + user2 := &token.UserContext{Username: "alice", Groups: []string{"c", "a", "b"}} + + _, _ = checker.IsAdmin(context.Background(), user1) + _, _ = checker.IsAdmin(context.Background(), user2) + + assert.Equal(t, 1, delegate.getCalls(), "group order should not matter for cache key") +} + +func TestCachedAdminChecker_NilUserReturnsFalse(t *testing.T) { + 
delegate := &mockDelegate{response: true} + checker := newTestChecker(delegate) + + result, err := checker.IsAdmin(context.Background(), nil) + assert.NoError(t, err) + assert.False(t, result) + assert.Equal(t, 0, delegate.getCalls()) +} + +func TestCachedAdminChecker_EmptyUsernameReturnsFalse(t *testing.T) { + delegate := &mockDelegate{response: true} + checker := newTestChecker(delegate) + + user := &token.UserContext{Username: "", Groups: []string{"admins"}} + result, err := checker.IsAdmin(context.Background(), user) + assert.NoError(t, err) + assert.False(t, result) + assert.Equal(t, 0, delegate.getCalls()) +} + +func TestCachedAdminChecker_NilCheckerReturnsFalse(t *testing.T) { + var checker *auth.CachedAdminChecker + user := &token.UserContext{Username: "alice", Groups: []string{"admins"}} + + result, err := checker.IsAdmin(context.Background(), user) + assert.NoError(t, err) + assert.False(t, result) +} + +func TestCachedAdminChecker_Metrics(t *testing.T) { + delegate := &mockDelegate{response: true} + reg := prometheus.NewRegistry() + checker := auth.NewCachedAdminChecker(delegate, time.Minute, 2*time.Second, 8192, reg, nil) + + user := &token.UserContext{Username: "alice", Groups: []string{"admins"}} + + _, _ = checker.IsAdmin(context.Background(), user) + + hits, misses := checker.Metrics() + assert.InDelta(t, 0, counterValue(t, hits), 0) + assert.InDelta(t, 1, counterValue(t, misses), 0) + + _, _ = checker.IsAdmin(context.Background(), user) + + assert.InDelta(t, 1, counterValue(t, hits), 0) + assert.InDelta(t, 1, counterValue(t, misses), 0) +} + +func TestCachedAdminChecker_ConcurrentAccess(t *testing.T) { + delegate := &mockDelegate{response: true} + checker := newTestChecker(delegate) + + user := &token.UserContext{Username: "alice", Groups: []string{"admins"}} + + var wg sync.WaitGroup + var trueCount atomic.Int64 + + for range 100 { + wg.Go(func() { + result, err := checker.IsAdmin(context.Background(), user) + assert.NoError(t, err) + if result { 
+ trueCount.Add(1) + } + }) + } + + wg.Wait() + assert.Equal(t, int64(100), trueCount.Load(), "all calls should return true") +} + +func TestCachedAdminChecker_NilDelegatePanics(t *testing.T) { + reg := prometheus.NewRegistry() + assert.Panics(t, func() { + auth.NewCachedAdminChecker(nil, time.Minute, 2*time.Second, 8192, reg, nil) + }) +} + +func TestCachedAdminChecker_NonPositiveTTLPanics(t *testing.T) { + delegate := &mockDelegate{response: true} + reg := prometheus.NewRegistry() + assert.Panics(t, func() { + auth.NewCachedAdminChecker(delegate, 0, 2*time.Second, 8192, reg, nil) + }) +} + +func TestCachedAdminChecker_NonPositiveNegativeTTLPanics(t *testing.T) { + delegate := &mockDelegate{response: true} + reg := prometheus.NewRegistry() + assert.Panics(t, func() { + auth.NewCachedAdminChecker(delegate, time.Minute, 0, 8192, reg, nil) + }) +} + +func TestCachedAdminChecker_NonPositiveMaxSizePanics(t *testing.T) { + delegate := &mockDelegate{response: true} + reg := prometheus.NewRegistry() + assert.Panics(t, func() { + auth.NewCachedAdminChecker(delegate, time.Minute, 2*time.Second, 0, reg, nil) + }) +} + +func TestCachedAdminChecker_MaxSizeEnforced(t *testing.T) { + delegate := &mockDelegate{response: true} + reg := prometheus.NewRegistry() + fakeClock := testingclock.NewFakeClock(time.Now()) + checker := auth.NewCachedAdminChecker(delegate, time.Minute, 2*time.Second, 2, reg, fakeClock) + + user1 := &token.UserContext{Username: "alice", Groups: []string{"admins"}} + user2 := &token.UserContext{Username: "bob", Groups: []string{"admins"}} + user3 := &token.UserContext{Username: "charlie", Groups: []string{"admins"}} + + // Fill cache to max (2 entries) + _, _ = checker.IsAdmin(context.Background(), user1) + _, _ = checker.IsAdmin(context.Background(), user2) + assert.Equal(t, 2, delegate.getCalls()) + + // Third user: cache is full, insert is skipped but result is still returned + result, err := checker.IsAdmin(context.Background(), user3) + require.NoError(t, 
err) + assert.True(t, result) + assert.Equal(t, 3, delegate.getCalls()) + + // user3 was not cached, so calling again should hit delegate + _, _ = checker.IsAdmin(context.Background(), user3) + assert.Equal(t, 4, delegate.getCalls(), "uncached entry should call delegate again") + + // user1 is still cached + _, _ = checker.IsAdmin(context.Background(), user1) + assert.Equal(t, 4, delegate.getCalls(), "cached entry should not call delegate") +} + +func TestCachedAdminChecker_FalseResultIsCached(t *testing.T) { + delegate := &mockDelegate{response: false} + checker := newTestChecker(delegate) + user := &token.UserContext{Username: "alice", Groups: []string{"users"}} + + result, err := checker.IsAdmin(context.Background(), user) + require.NoError(t, err) + assert.False(t, result) + + result, err = checker.IsAdmin(context.Background(), user) + require.NoError(t, err) + assert.False(t, result) + + assert.Equal(t, 1, delegate.getCalls(), "false results should also be cached") +} + +func TestCachedAdminChecker_EvictsExpiredEntries(t *testing.T) { + delegate := &mockDelegate{response: true} + reg := prometheus.NewRegistry() + fakeClock := testingclock.NewFakeClock(time.Now()) + checker := auth.NewCachedAdminChecker(delegate, 10*time.Second, 2*time.Second, 8192, reg, fakeClock) + + user1 := &token.UserContext{Username: "alice", Groups: []string{"admins"}} + user2 := &token.UserContext{Username: "bob", Groups: []string{"admins"}} + + // Cache user1 at t=0 + _, err := checker.IsAdmin(context.Background(), user1) + require.NoError(t, err) + require.Equal(t, 1, delegate.getCalls()) + + // At t=11s user1's entry is expired; calling user2 triggers eviction of user1 + fakeClock.Step(11 * time.Second) + _, err = checker.IsAdmin(context.Background(), user2) + require.NoError(t, err) + require.Equal(t, 2, delegate.getCalls()) + + // user1's entry was evicted, so this should call the delegate again + _, err = checker.IsAdmin(context.Background(), user1) + require.NoError(t, err) + 
assert.Equal(t, 3, delegate.getCalls(), "evicted entry should require fresh delegate call") +} + +func TestCachedAdminChecker_DelegateErrorNotCached(t *testing.T) { + delegate := &mockDelegate{response: false, err: assert.AnError} + checker := newTestChecker(delegate) + user := &token.UserContext{Username: "alice", Groups: []string{"admins"}} + + result, err := checker.IsAdmin(context.Background(), user) + assert.Error(t, err) + assert.False(t, result) + assert.Equal(t, 1, delegate.getCalls()) + + // Fix the delegate + delegate.mu.Lock() + delegate.response = true + delegate.err = nil + delegate.mu.Unlock() + + // Should call delegate again since error result was not cached + result, err = checker.IsAdmin(context.Background(), user) + assert.NoError(t, err) + assert.True(t, result) + assert.Equal(t, 2, delegate.getCalls()) +} + +func TestCachedAdminChecker_AsymmetricTTL(t *testing.T) { + delegate := &mockDelegate{response: false} + reg := prometheus.NewRegistry() + fakeClock := testingclock.NewFakeClock(time.Now()) + checker := auth.NewCachedAdminChecker(delegate, 30*time.Second, 2*time.Second, 8192, reg, fakeClock) + + user := &token.UserContext{Username: "alice", Groups: []string{"admins"}} + + // First call: not admin, cached with negative TTL (2s) + result, err := checker.IsAdmin(context.Background(), user) + require.NoError(t, err) + assert.False(t, result) + assert.Equal(t, 1, delegate.getCalls()) + + // Within negative TTL: still cached + fakeClock.Step(1 * time.Second) + result, err = checker.IsAdmin(context.Background(), user) + require.NoError(t, err) + assert.False(t, result) + assert.Equal(t, 1, delegate.getCalls(), "should use cache within negative TTL") + + // After negative TTL: cache expired, call delegate again + fakeClock.Step(2 * time.Second) + + // Change delegate to return true + delegate.mu.Lock() + delegate.response = true + delegate.mu.Unlock() + + result, err = checker.IsAdmin(context.Background(), user) + require.NoError(t, err) + 
assert.True(t, result) + assert.Equal(t, 2, delegate.getCalls(), "should call delegate after negative TTL expiry") + + // Now true result cached with positive TTL (30s): verify it persists + fakeClock.Step(10 * time.Second) + result, err = checker.IsAdmin(context.Background(), user) + require.NoError(t, err) + assert.True(t, result) + assert.Equal(t, 2, delegate.getCalls(), "true result should be cached with long TTL") +} + +func TestCachedAdminChecker_CanceledContextReturnsError(t *testing.T) { + delegate := &mockDelegate{response: true} + checker := newTestChecker(delegate) + user := &token.UserContext{Username: "alice", Groups: []string{"admins"}} + + ctx, cancel := context.WithCancel(context.Background()) + cancel() + + result, err := checker.IsAdmin(ctx, user) + assert.False(t, result) + assert.ErrorIs(t, err, context.Canceled) + assert.Equal(t, 0, delegate.getCalls(), "delegate should not be called with canceled context") +} diff --git a/maas-api/internal/auth/sar_admin_checker.go b/maas-api/internal/auth/sar_admin_checker.go index b5544fc6c..eee235500 100644 --- a/maas-api/internal/auth/sar_admin_checker.go +++ b/maas-api/internal/auth/sar_admin_checker.go @@ -2,7 +2,7 @@ package auth import ( "context" - "log/slog" + "fmt" authv1 "k8s.io/api/authorization/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -37,10 +37,9 @@ func NewSARAdminChecker(client kubernetes.Interface, namespace string) *SARAdmin // IsAdmin checks if the user can create maasauthpolicies in the configured namespace. // This is a proxy for "is this user an admin" based on RBAC permissions. -// Returns false (fail-closed) if the check cannot be performed. 
-func (s *SARAdminChecker) IsAdmin(ctx context.Context, user *token.UserContext) bool { +func (s *SARAdminChecker) IsAdmin(ctx context.Context, user *token.UserContext) (bool, error) { if s == nil || s.client == nil || user == nil || user.Username == "" { - return false + return false, nil } sar := &authv1.SubjectAccessReview{ @@ -58,9 +57,8 @@ func (s *SARAdminChecker) IsAdmin(ctx context.Context, user *token.UserContext) result, err := s.client.AuthorizationV1().SubjectAccessReviews().Create(ctx, sar, metav1.CreateOptions{}) if err != nil { - slog.Warn("SAR admin check failed", "error", err.Error()) - return false + return false, fmt.Errorf("SAR admin check: %w", err) } - return result.Status.Allowed + return result.Status.Allowed, nil } diff --git a/maas-api/internal/auth/sar_admin_checker_test.go b/maas-api/internal/auth/sar_admin_checker_test.go index a0b2a345e..d1131e60d 100644 --- a/maas-api/internal/auth/sar_admin_checker_test.go +++ b/maas-api/internal/auth/sar_admin_checker_test.go @@ -30,7 +30,9 @@ func TestSARAdminChecker_IsAdmin(t *testing.T) { checker := auth.NewSARAdminChecker(client, testNamespace) user := &token.UserContext{Username: "admin-user", Groups: []string{"admin-group"}} - assert.True(t, checker.IsAdmin(context.Background(), user)) + result, err := checker.IsAdmin(context.Background(), user) + require.NoError(t, err) + assert.True(t, result) }) t.Run("RegularUserDenied", func(t *testing.T) { @@ -45,14 +47,18 @@ func TestSARAdminChecker_IsAdmin(t *testing.T) { checker := auth.NewSARAdminChecker(client, testNamespace) user := &token.UserContext{Username: "regular-user", Groups: []string{"users"}} - assert.False(t, checker.IsAdmin(context.Background(), user)) + result, err := checker.IsAdmin(context.Background(), user) + require.NoError(t, err) + assert.False(t, result) }) t.Run("NilUserReturnsFalse", func(t *testing.T) { client := fake.NewSimpleClientset() checker := auth.NewSARAdminChecker(client, testNamespace) - assert.False(t, 
checker.IsAdmin(context.Background(), nil)) + result, err := checker.IsAdmin(context.Background(), nil) + assert.NoError(t, err) + assert.False(t, result) }) t.Run("EmptyUsernameReturnsFalse", func(t *testing.T) { @@ -60,14 +66,18 @@ func TestSARAdminChecker_IsAdmin(t *testing.T) { checker := auth.NewSARAdminChecker(client, testNamespace) user := &token.UserContext{Username: "", Groups: []string{"admin-group"}} - assert.False(t, checker.IsAdmin(context.Background(), user)) + result, err := checker.IsAdmin(context.Background(), user) + assert.NoError(t, err) + assert.False(t, result) }) t.Run("NilCheckerReturnsFalse", func(t *testing.T) { var checker *auth.SARAdminChecker user := &token.UserContext{Username: "admin-user", Groups: []string{"admin-group"}} - assert.False(t, checker.IsAdmin(context.Background(), user)) + result, err := checker.IsAdmin(context.Background(), user) + assert.NoError(t, err) + assert.False(t, result) }) t.Run("NilClientPanics", func(t *testing.T) { @@ -83,7 +93,7 @@ func TestSARAdminChecker_IsAdmin(t *testing.T) { }) }) - t.Run("APIErrorReturnsFalse_FailClosed", func(t *testing.T) { + t.Run("APIErrorReturnsError", func(t *testing.T) { client := fake.NewSimpleClientset() client.PrependReactor("create", "subjectaccessreviews", func(action k8stesting.Action) (bool, runtime.Object, error) { return true, nil, assert.AnError @@ -92,7 +102,9 @@ func TestSARAdminChecker_IsAdmin(t *testing.T) { checker := auth.NewSARAdminChecker(client, testNamespace) user := &token.UserContext{Username: "admin-user", Groups: []string{"admin-group"}} - assert.False(t, checker.IsAdmin(context.Background(), user), "should fail-closed on API error") + result, err := checker.IsAdmin(context.Background(), user) + assert.False(t, result, "should fail-closed on API error") + assert.Error(t, err, "should return error on API failure") }) t.Run("VerifiesSARParameters", func(t *testing.T) { @@ -109,7 +121,8 @@ func TestSARAdminChecker_IsAdmin(t *testing.T) { checker := 
auth.NewSARAdminChecker(client, testNamespace) user := &token.UserContext{Username: "alice", Groups: []string{"group1", "group2"}} - checker.IsAdmin(context.Background(), user) + _, err := checker.IsAdmin(context.Background(), user) + require.NoError(t, err) require.NotNil(t, capturedSAR) assert.Equal(t, "alice", capturedSAR.Spec.User) @@ -134,7 +147,8 @@ func TestSARAdminChecker_IsAdmin(t *testing.T) { checker := auth.NewSARAdminChecker(client, "custom-namespace") user := &token.UserContext{Username: "alice", Groups: []string{"users"}} - checker.IsAdmin(context.Background(), user) + _, err := checker.IsAdmin(context.Background(), user) + require.NoError(t, err) require.NotNil(t, capturedSAR) assert.Equal(t, "custom-namespace", capturedSAR.Spec.ResourceAttributes.Namespace) diff --git a/maas-api/internal/config/cluster_config.go b/maas-api/internal/config/cluster_config.go index 7336fd9f0..96c4c1fb4 100644 --- a/maas-api/internal/config/cluster_config.go +++ b/maas-api/internal/config/cluster_config.go @@ -29,7 +29,8 @@ type ClusterConfig struct { // AdminChecker uses SubjectAccessReview to check if a user is an admin. // Admin is determined by RBAC: can user create maasauthpolicies in the configured MaaS namespace? - AdminChecker *auth.SARAdminChecker + // Results are cached with a TTL to reduce Kubernetes API server load. 
+ AdminChecker *auth.CachedAdminChecker informersSynced []cache.InformerSynced startFuncs []func(<-chan struct{}) @@ -78,7 +79,7 @@ func (s *subscriptionLister) List() ([]*unstructured.Unstructured, error) { return out, nil } -func NewClusterConfig(_ string, subscriptionNamespace string, resyncPeriod time.Duration) (*ClusterConfig, error) { +func NewClusterConfig(_ string, subscriptionNamespace string, resyncPeriod time.Duration, sarCacheMaxSize int) (*ClusterConfig, error) { restConfig, err := LoadRestConfig() if err != nil { return nil, fmt.Errorf("failed to create kubernetes config: %w", err) @@ -109,7 +110,9 @@ func NewClusterConfig(_ string, subscriptionNamespace string, resyncPeriod time. // SAR-based admin checker: uses SubjectAccessReview to check RBAC permissions. // Admin is determined by: can user create maasauthpolicies in the MaaS namespace? // This aligns with RBAC from opendatahub-operator#3301 which grants admin groups CRUD access to MaaS resources. - adminCheckerVal := auth.NewSARAdminChecker(clientset, subscriptionNamespace) + // Results are cached for 30s to reduce K8s API server load under high traffic. + sarChecker := auth.NewSARAdminChecker(clientset, subscriptionNamespace) + adminCheckerVal := auth.NewCachedAdminChecker(sarChecker, 30*time.Second, 2*time.Second, sarCacheMaxSize, nil, nil) return &ClusterConfig{ ClientSet: clientset, diff --git a/maas-api/internal/config/config.go b/maas-api/internal/config/config.go index f42b5b493..52e845982 100644 --- a/maas-api/internal/config/config.go +++ b/maas-api/internal/config/config.go @@ -52,6 +52,10 @@ type Config struct { // window are excluded (fail-closed). Default: 15 seconds. Minimum: 1 second. AccessCheckTimeoutSeconds int + // SARCacheMaxSize is the maximum number of entries in the SAR admin-check cache. + // Bounds memory usage under high-cardinality user traffic. Default: 8192. 
+ SARCacheMaxSize int + // Deprecated flag (backward compatibility with pre-TLS version) deprecatedHTTPPort string } @@ -63,6 +67,7 @@ func Load() *Config { secure, _ := env.GetBool("SECURE", false) maxExpirationDays, _ := env.GetInt("API_KEY_MAX_EXPIRATION_DAYS", constant.DefaultAPIKeyMaxExpirationDays) accessCheckTimeoutSeconds, _ := env.GetInt("ACCESS_CHECK_TIMEOUT_SECONDS", 15) + sarCacheMaxSize, _ := env.GetInt("SAR_CACHE_MAX_SIZE", constant.DefaultSARCacheMaxSize) c := &Config{ Name: env.GetString("INSTANCE_NAME", gatewayName), @@ -77,6 +82,7 @@ func Load() *Config { DBConnectionURL: "", // Loaded from K8s secret via LoadDatabaseURL() APIKeyMaxExpirationDays: maxExpirationDays, AccessCheckTimeoutSeconds: accessCheckTimeoutSeconds, + SARCacheMaxSize: sarCacheMaxSize, // Deprecated env var (backward compatibility with pre-TLS version) deprecatedHTTPPort: env.GetString("PORT", ""), } diff --git a/maas-api/internal/constant/const.go b/maas-api/internal/constant/const.go index da0e5f5aa..4130bb4ae 100644 --- a/maas-api/internal/constant/const.go +++ b/maas-api/internal/constant/const.go @@ -18,6 +18,9 @@ const ( // DefaultAPIKeyMaxExpirationDays is the default maximum allowed expiration for API keys. DefaultAPIKeyMaxExpirationDays = 90 + // DefaultSARCacheMaxSize is the maximum number of entries in the SAR admin-check cache. + DefaultSARCacheMaxSize = 8192 + // LLMInferenceService annotation keys for model metadata. AnnotationGenAIUseCase = "opendatahub.io/genai-use-case" AnnotationDescription = "openshift.io/description"