Skip to content

Commit 7bfb2ad

Browse files
committed
operator: invalidate AIStore API client with failed auth token
Signed-off-by: Aaron Wilson <aawilson@nvidia.com>
1 parent d406acb commit 7bfb2ad

File tree

3 files changed

+208
-8
lines changed

3 files changed

+208
-8
lines changed

operator/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ We structure this changelog in accordance with [Keep a Changelog](https://keepac
1313
### Changed
1414

1515
- Fixed syncPodTemplate not syncing env var removals when the desired env slice was a prefix of the current, causing rollout loops on config changes such as clearing `authNSecretName`.
16+
- Invalidate the AIStore API client on authentication failure, not just on failure to acquire initial token.
1617

1718
## v2.15.0
1819

operator/pkg/services/aisapi.go

Lines changed: 55 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,45 @@ type (
4747
mode string
4848
tlsCfg *tls.Config
4949
tokenExpireAt time.Time
50+
authFailed bool
5051
}
5152
)
5253

54+
// IsAuthError returns true if the error is an HTTP 401 or 403 from the AIS API,
55+
// indicating the token is invalid or has been revoked.
56+
func IsAuthError(err error) bool {
57+
if err == nil {
58+
return false
59+
}
60+
herr, ok := err.(*cmn.ErrHTTP)
61+
if !ok {
62+
return false
63+
}
64+
return herr.Status == http.StatusUnauthorized || herr.Status == http.StatusForbidden
65+
}
66+
67+
// checkAuthErr inspects an error from an AIS API call and marks the client's
68+
// token as failed if the cluster responded with 401/403. This ensures the next
69+
// call to GetClient will discard the cached client and fetch a fresh token.
70+
func (c *AIStoreClient) checkAuthErr(err error) {
71+
if IsAuthError(err) {
72+
c.authFailed = true
73+
logf.FromContext(c.ctx).Info("AIS API returned auth error, token will be refreshed on next reconcile")
74+
}
75+
}
76+
5377
// HasValidBaseParams checks if the client has valid params for the given AIS cluster configuration
5478
func (c *AIStoreClient) HasValidBaseParams(ctx context.Context, ais *aisv1.AIStore, expectedURL string) bool {
5579
if c.params == nil {
5680
return false
5781
}
82+
83+
// If a previous API call returned 401/403, force token refresh
84+
if c.authFailed {
85+
logf.FromContext(ctx).Info("Token previously rejected by AIS (401/403), recreating client")
86+
return false
87+
}
88+
5889
// Check if the URL has changed
5990
if c.params.URL != expectedURL {
6091
return false
@@ -114,35 +145,51 @@ func (c *AIStoreClient) refreshToken(tokenInfo *TokenInfo) {
114145
}
115146

116147
func (c *AIStoreClient) DecommissionCluster(rmUserData bool) error {
117-
return api.DecommissionCluster(*c.params, rmUserData)
148+
err := api.DecommissionCluster(*c.params, rmUserData)
149+
c.checkAuthErr(err)
150+
return err
118151
}
119152

120153
func (c *AIStoreClient) DecommissionNode(actValue *apc.ActValRmNode) (string, error) {
121-
return api.DecommissionNode(*c.params, actValue)
154+
xid, err := api.DecommissionNode(*c.params, actValue)
155+
c.checkAuthErr(err)
156+
return xid, err
122157
}
123158

124159
func (c *AIStoreClient) GetClusterMap() (smap *meta.Smap, err error) {
125-
return api.GetClusterMap(*c.params)
160+
smap, err = api.GetClusterMap(*c.params)
161+
c.checkAuthErr(err)
162+
return
126163
}
127164

128165
func (c *AIStoreClient) Health(readyToRebalance bool) error {
129-
return api.Health(*c.params, readyToRebalance)
166+
err := api.Health(*c.params, readyToRebalance)
167+
c.checkAuthErr(err)
168+
return err
130169
}
131170

132171
func (c *AIStoreClient) SetClusterConfigUsingMsg(config *cmn.ConfigToSet, transient bool) error {
133-
return api.SetClusterConfigUsingMsg(*c.params, config, transient)
172+
err := api.SetClusterConfigUsingMsg(*c.params, config, transient)
173+
c.checkAuthErr(err)
174+
return err
134175
}
135176

136177
func (c *AIStoreClient) SetPrimaryProxy(newPrimaryID, newPrimaryURL string, force bool) error {
137-
return api.SetPrimary(*c.params, newPrimaryID, newPrimaryURL, force)
178+
err := api.SetPrimary(*c.params, newPrimaryID, newPrimaryURL, force)
179+
c.checkAuthErr(err)
180+
return err
138181
}
139182

140183
func (c *AIStoreClient) ShutdownCluster() error {
141-
return api.ShutdownCluster(*c.params)
184+
err := api.ShutdownCluster(*c.params)
185+
c.checkAuthErr(err)
186+
return err
142187
}
143188

144189
func (c *AIStoreClient) StartMaintenance(actValue *apc.ActValRmNode) (string, error) {
145-
return api.StartMaintenance(*c.params, actValue)
190+
xid, err := api.StartMaintenance(*c.params, actValue)
191+
c.checkAuthErr(err)
192+
return xid, err
146193
}
147194

148195
func NewAIStoreClient(ctx context.Context, url string, tokenInfo *TokenInfo, mode string, tlsCfg *tls.Config) *AIStoreClient {

operator/pkg/services/token_expiration_test.go

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,12 @@ package services
66

77
import (
88
"context"
9+
"errors"
10+
"net/http"
911
"testing"
1012
"time"
1113

14+
"github.com/NVIDIA/aistore/cmn"
1215
aisv1 "github.com/ais-operator/api/v1beta1"
1316
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1417
)
@@ -118,3 +121,152 @@ func TestTokenInfoStructure(t *testing.T) {
118121
}
119122
})
120123
}
124+
125+
func TestIsAuthError(t *testing.T) {
126+
tests := []struct {
127+
name string
128+
err error
129+
expected bool
130+
}{
131+
{
132+
name: "nil error",
133+
err: nil,
134+
expected: false,
135+
},
136+
{
137+
name: "non-HTTP error",
138+
err: errors.New("connection refused"),
139+
expected: false,
140+
},
141+
{
142+
name: "401 Unauthorized",
143+
err: &cmn.ErrHTTP{
144+
Status: http.StatusUnauthorized,
145+
},
146+
expected: true,
147+
},
148+
{
149+
name: "403 Forbidden",
150+
err: &cmn.ErrHTTP{
151+
Status: http.StatusForbidden,
152+
},
153+
expected: true,
154+
},
155+
{
156+
name: "404 Not Found",
157+
err: &cmn.ErrHTTP{
158+
Status: http.StatusNotFound,
159+
},
160+
expected: false,
161+
},
162+
{
163+
name: "503 Service Unavailable",
164+
err: &cmn.ErrHTTP{
165+
Status: http.StatusServiceUnavailable,
166+
},
167+
expected: false,
168+
},
169+
{
170+
name: "500 Internal Server Error",
171+
err: &cmn.ErrHTTP{
172+
Status: http.StatusInternalServerError,
173+
},
174+
expected: false,
175+
},
176+
}
177+
178+
for _, tt := range tests {
179+
t.Run(tt.name, func(t *testing.T) {
180+
result := IsAuthError(tt.err)
181+
if result != tt.expected {
182+
t.Errorf("IsAuthError(%v) = %v, want %v", tt.err, result, tt.expected)
183+
}
184+
})
185+
}
186+
}
187+
188+
func TestAuthFailedInvalidatesClient(t *testing.T) {
189+
ctx := context.Background()
190+
testURL := "http://test:8080"
191+
192+
ais := &aisv1.AIStore{
193+
ObjectMeta: metav1.ObjectMeta{
194+
Name: "test-cluster",
195+
Namespace: "default",
196+
},
197+
Spec: aisv1.AIStoreSpec{},
198+
}
199+
200+
t.Run("client valid before auth failure", func(t *testing.T) {
201+
client := &AIStoreClient{
202+
ctx: ctx,
203+
params: buildBaseParams(testURL, "", nil),
204+
mode: "",
205+
}
206+
207+
if !client.HasValidBaseParams(ctx, ais, testURL) {
208+
t.Error("Expected client to be valid before any auth failure")
209+
}
210+
})
211+
212+
t.Run("client invalid after auth failure", func(t *testing.T) {
213+
client := &AIStoreClient{
214+
ctx: ctx,
215+
params: buildBaseParams(testURL, "", nil),
216+
mode: "",
217+
}
218+
219+
// Simulate receiving a 401 error
220+
authErr := &cmn.ErrHTTP{Status: http.StatusUnauthorized}
221+
client.checkAuthErr(authErr)
222+
223+
if client.HasValidBaseParams(ctx, ais, testURL) {
224+
t.Error("Expected client to be invalid after 401 error")
225+
}
226+
})
227+
228+
t.Run("non-auth error does not invalidate client", func(t *testing.T) {
229+
client := &AIStoreClient{
230+
ctx: ctx,
231+
params: buildBaseParams(testURL, "", nil),
232+
mode: "",
233+
}
234+
235+
// Simulate receiving a 500 error
236+
serverErr := &cmn.ErrHTTP{Status: http.StatusInternalServerError}
237+
client.checkAuthErr(serverErr)
238+
239+
if !client.HasValidBaseParams(ctx, ais, testURL) {
240+
t.Error("Expected client to remain valid after non-auth error")
241+
}
242+
})
243+
244+
t.Run("nil error does not invalidate client", func(t *testing.T) {
245+
client := &AIStoreClient{
246+
ctx: ctx,
247+
params: buildBaseParams(testURL, "", nil),
248+
mode: "",
249+
}
250+
251+
client.checkAuthErr(nil)
252+
253+
if !client.HasValidBaseParams(ctx, ais, testURL) {
254+
t.Error("Expected client to remain valid after nil error")
255+
}
256+
})
257+
258+
t.Run("403 Forbidden invalidates client", func(t *testing.T) {
259+
client := &AIStoreClient{
260+
ctx: ctx,
261+
params: buildBaseParams(testURL, "", nil),
262+
mode: "",
263+
}
264+
265+
authErr := &cmn.ErrHTTP{Status: http.StatusForbidden}
266+
client.checkAuthErr(authErr)
267+
268+
if client.HasValidBaseParams(ctx, ais, testURL) {
269+
t.Error("Expected client to be invalid after 403 error")
270+
}
271+
})
272+
}

0 commit comments

Comments
 (0)