@@ -283,11 +283,11 @@ func (mts *MonitoredTokenSource) Stopped() <-chan struct{} {
283283 return mts .stopped
284284}
285285
286- // Token retrieves a token, retrying with exponential backoff on transient errors
287- // (see isTransientNetworkError for the full list). On non-transient errors
288- // (OAuth 4xx, TLS failures) it marks the workload as unauthenticated and returns
289- // immediately. Context cancellation (workload removal) stops the retry without
290- // marking the workload as unauthenticated.
286+ // Token retrieves a token, retrying with exponential backoff on transient
287+ // errors and marking the workload as unauthenticated on non-transient errors.
288+ // See isTransientNetworkError for the classification rule. Context
289+ // cancellation (workload removal) stops the retry without marking the
290+ // workload as unauthenticated.
291291//
292292// Concurrent callers are deduplicated via singleflight so that only one retry
293293// loop runs at a time during transient failures.
@@ -382,13 +382,20 @@ func (mts *MonitoredTokenSource) onTick() (bool, time.Duration) {
382382 return false , wait
383383}
384384
385- // isTransientNetworkError reports whether err represents a transient condition
386- // (DNS failure, TCP transport error, timeout, OAuth server 5xx, unparsable
387- // token response) that is likely to resolve on its own.
385+ // isTransientNetworkError reports whether err represents a transient
386+ // condition that is likely to resolve on its own. The categories are:
388387//
389- // OAuth2 client-level auth failures (invalid_grant, 401, 400) and TLS errors
390- // (certificate verification, handshake failure) are NOT considered transient and
391- // return false so the workload is marked unauthenticated immediately.
388+ // - Network-level failures: DNS lookup errors, TCP transport errors,
389+ // timeouts.
390+ // - OAuth token-endpoint responses classified as transient by
391+ // isTransientRetrieveError.
392+ // - Unparsable token responses on a 2xx status (typically an HTML page
393+ // from a load balancer or CDN).
394+ //
395+ // All other errors return false, causing the workload to be marked
396+ // unauthenticated. TLS failures (certificate verification, handshake
397+ // failure) are intentionally non-transient even though they surface
398+ // through net.OpError like transport-level errors.
392399//
393400// The function is side-effect free; callers that want to emit a DCR
394401// remediation hint on a permanent 4xx must do so themselves at the
@@ -400,17 +407,8 @@ func isTransientNetworkError(err error) bool {
400407 return false
401408 }
402409
403- // OAuth HTTP-level errors: 5xx (Bad Gateway, Service Unavailable, Gateway
404- // Timeout) are transient server-side issues that typically resolve on their
405- // own. 4xx errors (invalid_grant, invalid_client) are permanent auth failures.
406410 if retrieveErr , ok := errors.AsType [* oauth2.RetrieveError ](err ); ok {
407- if retrieveErr .Response != nil && retrieveErr .Response .StatusCode >= 500 {
408- slog .Debug ("treating OAuth server error as transient" ,
409- "status_code" , retrieveErr .Response .StatusCode ,
410- )
411- return true
412- }
413- return false
411+ return isTransientRetrieveError (retrieveErr )
414412 }
415413
416414 // Non-JSON responses from the OAuth server (e.g. load balancer HTML pages).
@@ -445,33 +443,76 @@ func isTransientNetworkError(err error) bool {
445443}
446444
447445// isPermanentTokenEndpointError reports whether err is an *oauth2.RetrieveError
448- // whose status implies the cached client credentials are themselves the
449- // problem — specifically 400 (invalid_grant / invalid_client), 401, or
450- // 403. Used at state-transition boundaries to decide whether to emit a
451- // DCR/CIMD remediation hint alongside the unauthentication.
446+ // whose response carries a structured RFC 6749 'error' code, implying the
447+ // OAuth server itself rendered a verdict on the cached credentials
448+ // (invalid_grant, invalid_client, etc.). Used at state-transition
449+ // boundaries to decide whether to emit a DCR/CIMD remediation hint
450+ // alongside the unauthentication.
452451//
453- // Other 4xx codes are intentionally NOT treated as permanent here even
454- // though isTransientNetworkError classifies the whole RetrieveError
455- // branch as non-transient. 408 (Request Timeout) and 429 (Too Many
456- // Requests) are typically transient back-pressure that the operator
457- // cannot remediate by deleting cached credentials; firing the
458- // "delete the cached credentials and restart" Warn on those would
459- // mislead operators chasing a transient hiccup. The narrower allowlist
460- // keeps the remediation hint truthful.
452+ // This is the strict inverse of isTransientRetrieveError on the
453+ // *oauth2.RetrieveError branch: a response is "permanent" iff the
454+ // classifier would NOT call it transient. Concretely, the Warn fires
455+ // only when ErrorCode is populated. 4xx responses without an OAuth
456+ // error code (HTML pages from a WAF, CDN, or reverse proxy) — like
457+ // 5xx, 429, 408, and nil-Response shapes — are treated as
458+ // non-permanent because we have no OAuth-protocol verdict to act on.
459+ // Recommending the user delete cached credentials based on a non-
460+ // spec-compliant response would frequently mislead operators whose
461+ // real problem is upstream of the OAuth server.
461462func isPermanentTokenEndpointError (err error ) bool {
462463 retrieveErr , ok := errors.AsType [* oauth2.RetrieveError ](err )
463- if ! ok {
464+ if ! ok || retrieveErr . Response == nil {
464465 return false
465466 }
467+ return ! isTransientRetrieveError (retrieveErr )
468+ }
469+
470+ // isTransientRetrieveError reports whether an *oauth2.RetrieveError should
471+ // be treated as transient. The classification rules are:
472+ //
473+ // - nil Response: non-transient. There is no signal to act on, so we fall
474+ // through to the unauthenticated path rather than retry blindly.
475+ // - 5xx status: transient (server-side issue, likely to resolve).
476+ // - 429 Too Many Requests: transient regardless of body (HTTP standard).
477+ // - 4xx with an empty ErrorCode: transient. The oauth2 library populates
478+ // ErrorCode from the RFC 6749 'error' field in a JSON response body. An
479+ // empty ErrorCode means the response was not a parseable OAuth error —
480+ // typically an HTML page from a WAF, CDN, or reverse proxy that
481+ // intercepted the request before it reached the OAuth server. These
482+ // infrastructure errors (Cloudflare blocks, residential-IP allowlist
483+ // misses, transient bad-config deploys) commonly resolve on their own.
484+ // - 4xx with a populated ErrorCode: permanent. The OAuth server returned
485+ // a structured error code (invalid_grant, invalid_client, etc.) telling
486+ // us specifically what's wrong; retrying won't help.
487+ func isTransientRetrieveError (retrieveErr * oauth2.RetrieveError ) bool {
466488 if retrieveErr .Response == nil {
467489 return false
468490 }
469- switch retrieveErr .Response .StatusCode {
470- case http .StatusBadRequest , http .StatusUnauthorized , http .StatusForbidden :
491+ statusCode := retrieveErr .Response .StatusCode
492+
493+ if statusCode >= 500 {
494+ slog .Debug ("treating OAuth server error as transient" ,
495+ "status_code" , statusCode ,
496+ )
471497 return true
472- default :
473- return false
474498 }
499+
500+ if statusCode == http .StatusTooManyRequests {
501+ slog .Debug ("treating OAuth rate-limit response as transient" ,
502+ "status_code" , statusCode ,
503+ "error_code" , retrieveErr .ErrorCode ,
504+ )
505+ return true
506+ }
507+
508+ if retrieveErr .ErrorCode == "" {
509+ slog .Debug ("treating OAuth 4xx without error code as transient" ,
510+ "status_code" , statusCode ,
511+ )
512+ return true
513+ }
514+
515+ return false
475516}
476517
477518// isOAuthParseError detects errors from the oauth2 library that indicate the
0 commit comments