@@ -3571,6 +3571,12 @@ def get_httpx_client_factory(
35713571 # Use isolated client for gateway health checks (each gateway may have custom CA cert)
35723572 # Use admin timeout for health checks (fail fast, don't wait 120s for slow upstreams)
35733573 # Pass ssl_context if present, otherwise let get_isolated_http_client use skip_ssl_verify setting
3574+ # Track whether this is an unauthenticated liveness probe
3575+ # (authorization_code gateway without system token). When True,
3576+ # HTTP 401/403 responses are treated as "server alive" rather
3577+ # than health-check failures.
3578+ unauthenticated_probe = False
3579+
35743580 async with get_isolated_http_client (timeout = settings .httpx_admin_read_timeout , verify = ssl_context ) as client :
35753581 logger .debug (f"Checking health of gateway: { gateway_name } ({ gateway_url_sanitized } )" )
35763582 try :
@@ -3581,40 +3587,40 @@ def get_httpx_client_factory(
35813587 grant_type = gateway_oauth_config .get ("grant_type" , "client_credentials" )
35823588
35833589 if grant_type == "authorization_code" :
3584- # For Authorization Code flow, try to get stored tokens
3590+ # Authorization Code flow requires an interactive user
3591+ # to complete the OAuth dance. The health-check runs
3592+ # under a system identity (platform_admin_email) which
3593+ # typically has NO stored token for these gateways.
3594+ #
3595+ # Strategy: try to use a stored token if available;
3596+ # otherwise proceed WITHOUT auth. An unauthenticated
3597+ # probe still tests connectivity:
3598+ # - 401/403 → server is alive (auth rejected, not down)
3599+ # - timeout/DNS/connection error → real outage
3600+ # This avoids the old behaviour of marking the gateway
3601+ # as failed just because the system account has no token.
3602+ access_token = None
35853603 try :
35863604 # First-Party
35873605 from mcpgateway .services .token_storage_service import TokenStorageService # pylint: disable=import-outside-toplevel
35883606
3589- # Use fresh session for OAuth token lookup
3590- with fresh_db_session () as token_db :
3591- token_storage = TokenStorageService (token_db )
3592-
3593- # Get user-specific OAuth token
3594- if not user_email :
3595- if span :
3596- set_span_attribute (span , "health.status" , "unhealthy" )
3597- set_span_error (span , "User email required for OAuth token" )
3598- await self ._handle_gateway_failure (gateway )
3599- return
3600-
3601- access_token = await token_storage .get_user_token (gateway_id , user_email )
3602-
3603- if access_token :
3604- headers ["Authorization" ] = f"Bearer { access_token } "
3605- else :
3606- if span :
3607- set_span_attribute (span , "health.status" , "unhealthy" )
3608- set_span_error (span , "No valid OAuth token for user" )
3609- await self ._handle_gateway_failure (gateway )
3610- return
3607+ if user_email :
3608+ with fresh_db_session () as token_db :
3609+ token_storage = TokenStorageService (token_db )
3610+ access_token = await token_storage .get_user_token (gateway_id , user_email )
36113611 except Exception as e :
3612- logger .error (f"Failed to obtain stored OAuth token for gateway { gateway_name } : { e } " )
3613- if span :
3614- set_span_attribute (span , "health.status" , "unhealthy" )
3615- set_span_error (span , "Failed to obtain stored OAuth token" )
3616- await self ._handle_gateway_failure (gateway )
3617- return
3612+ logger .debug (f"Could not look up OAuth token for health check on { gateway_name } : { e } " )
3613+
3614+ if access_token :
3615+ headers ["Authorization" ] = f"Bearer { access_token } "
3616+ else :
3617+ # No token — proceed without auth. The probe will
3618+ # likely get 401/403 which is fine (proves liveness).
3619+ unauthenticated_probe = True
3620+ logger .debug (
3621+ f"Health-checking authorization_code gateway "
3622+ f"{ gateway_name } without auth (no system token)"
3623+ )
36183624 else :
36193625 # For Client Credentials flow, get token directly
36203626 try :
@@ -3636,12 +3642,17 @@ def get_httpx_client_factory(
36363642 else :
36373643 headers = {}
36383644
3639- # Perform the GET and raise on 4xx/5xx
3645+ # Perform the actual connectivity probe.
3646+ # For unauthenticated probes (auth_code gateways without
3647+ # a system token), 401/403 proves the server is alive —
3648+ # only network errors indicate a real outage.
36403649 if (gateway_transport ).lower () == "sse" :
36413650 timeout = httpx .Timeout (settings .health_check_timeout )
36423651 async with client .stream ("GET" , gateway_url , headers = headers , timeout = timeout ) as response :
3643- # This will raise immediately if status is 4xx/5xx
3644- response .raise_for_status ()
3652+ if unauthenticated_probe and response .status_code in (401 , 403 ):
3653+ logger .debug (f"Gateway { gateway_name } returned { response .status_code } (auth rejected, server alive)" )
3654+ else :
3655+ response .raise_for_status ()
36453656 if span :
36463657 set_span_attribute (span , "http.status_code" , response .status_code )
36473658 elif (gateway_transport ).lower () == "streamablehttp" :
@@ -3698,7 +3709,10 @@ def get_httpx_client_factory(
36983709 }
36993710 timeout = httpx .Timeout (settings .health_check_timeout )
37003711 response = await client .post (gateway_url , json = init_payload , headers = init_headers , timeout = timeout )
3701- response .raise_for_status ()
3712+ if unauthenticated_probe and response .status_code in (401 , 403 ):
3713+ logger .debug (f"Gateway { gateway_name } returned { response .status_code } (auth rejected, server alive)" )
3714+ else :
3715+ response .raise_for_status ()
37023716
37033717 # Reactivate gateway if it was previously inactive and health check passed now
37043718 if gateway_enabled and not gateway_reachable :
@@ -3776,6 +3790,8 @@ def get_httpx_client_factory(
37763790 if span :
37773791 set_span_attribute (span , "health.status" , "healthy" )
37783792 set_span_attribute (span , "success" , True )
3793+ if unauthenticated_probe :
3794+ set_span_attribute (span , "health.probe_type" , "unauthenticated_liveness" )
37793795
37803796 except Exception as e :
37813797 if span :
0 commit comments