Skip to content

Commit 099f3f0

Browse files
author
Olivier Gintrand
committed
fix: unauthenticated liveness probe for authorization_code gateways
The health check runs under platform_admin_email (a bootstrap service account), which can never complete an OAuth authorization_code flow. Previously, this caused _handle_gateway_failure to be called, marking gateways unreachable within ~3 minutes of any manual re-authorization.

Instead of skipping the health check entirely (which would leave no outage detection), send an unauthenticated connectivity probe:
- 401/403 = server is alive (auth rejected, not down) → reachable
- timeout / DNS / connection error = real outage → failure counter

This preserves genuine outage detection while preventing false deactivation due to missing system-level OAuth tokens. If a valid token IS available for platform_admin_email (e.g. it matches a real interactive user), the full authenticated health check runs as before.
1 parent 46a2622 commit 099f3f0

File tree

1 file changed

+49
-33
lines changed

1 file changed

+49
-33
lines changed

mcpgateway/services/gateway_service.py

Lines changed: 49 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -3571,6 +3571,12 @@ def get_httpx_client_factory(
35713571
# Use isolated client for gateway health checks (each gateway may have custom CA cert)
35723572
# Use admin timeout for health checks (fail fast, don't wait 120s for slow upstreams)
35733573
# Pass ssl_context if present, otherwise let get_isolated_http_client use skip_ssl_verify setting
3574+
# Track whether this is an unauthenticated liveness probe
3575+
# (authorization_code gateway without system token). When True,
3576+
# HTTP 401/403 responses are treated as "server alive" rather
3577+
# than health-check failures.
3578+
unauthenticated_probe = False
3579+
35743580
async with get_isolated_http_client(timeout=settings.httpx_admin_read_timeout, verify=ssl_context) as client:
35753581
logger.debug(f"Checking health of gateway: {gateway_name} ({gateway_url_sanitized})")
35763582
try:
@@ -3581,40 +3587,40 @@ def get_httpx_client_factory(
35813587
grant_type = gateway_oauth_config.get("grant_type", "client_credentials")
35823588

35833589
if grant_type == "authorization_code":
3584-
# For Authorization Code flow, try to get stored tokens
3590+
# Authorization Code flow requires an interactive user
3591+
# to complete the OAuth dance. The health-check runs
3592+
# under a system identity (platform_admin_email) which
3593+
# typically has NO stored token for these gateways.
3594+
#
3595+
# Strategy: try to use a stored token if available;
3596+
# otherwise proceed WITHOUT auth. An unauthenticated
3597+
# probe still tests connectivity:
3598+
# - 401/403 → server is alive (auth rejected, not down)
3599+
# - timeout/DNS/connection error → real outage
3600+
# This avoids the old behaviour of marking the gateway
3601+
# as failed just because the system account has no token.
3602+
access_token = None
35853603
try:
35863604
# First-Party
35873605
from mcpgateway.services.token_storage_service import TokenStorageService # pylint: disable=import-outside-toplevel
35883606

3589-
# Use fresh session for OAuth token lookup
3590-
with fresh_db_session() as token_db:
3591-
token_storage = TokenStorageService(token_db)
3592-
3593-
# Get user-specific OAuth token
3594-
if not user_email:
3595-
if span:
3596-
set_span_attribute(span, "health.status", "unhealthy")
3597-
set_span_error(span, "User email required for OAuth token")
3598-
await self._handle_gateway_failure(gateway)
3599-
return
3600-
3601-
access_token = await token_storage.get_user_token(gateway_id, user_email)
3602-
3603-
if access_token:
3604-
headers["Authorization"] = f"Bearer {access_token}"
3605-
else:
3606-
if span:
3607-
set_span_attribute(span, "health.status", "unhealthy")
3608-
set_span_error(span, "No valid OAuth token for user")
3609-
await self._handle_gateway_failure(gateway)
3610-
return
3607+
if user_email:
3608+
with fresh_db_session() as token_db:
3609+
token_storage = TokenStorageService(token_db)
3610+
access_token = await token_storage.get_user_token(gateway_id, user_email)
36113611
except Exception as e:
3612-
logger.error(f"Failed to obtain stored OAuth token for gateway {gateway_name}: {e}")
3613-
if span:
3614-
set_span_attribute(span, "health.status", "unhealthy")
3615-
set_span_error(span, "Failed to obtain stored OAuth token")
3616-
await self._handle_gateway_failure(gateway)
3617-
return
3612+
logger.debug(f"Could not look up OAuth token for health check on {gateway_name}: {e}")
3613+
3614+
if access_token:
3615+
headers["Authorization"] = f"Bearer {access_token}"
3616+
else:
3617+
# No token — proceed without auth. The probe will
3618+
# likely get 401/403 which is fine (proves liveness).
3619+
unauthenticated_probe = True
3620+
logger.debug(
3621+
f"Health-checking authorization_code gateway "
3622+
f"{gateway_name} without auth (no system token)"
3623+
)
36183624
else:
36193625
# For Client Credentials flow, get token directly
36203626
try:
@@ -3636,12 +3642,17 @@ def get_httpx_client_factory(
36363642
else:
36373643
headers = {}
36383644

3639-
# Perform the GET and raise on 4xx/5xx
3645+
# Perform the actual connectivity probe.
3646+
# For unauthenticated probes (auth_code gateways without
3647+
# a system token), 401/403 proves the server is alive —
3648+
# only network errors indicate a real outage.
36403649
if (gateway_transport).lower() == "sse":
36413650
timeout = httpx.Timeout(settings.health_check_timeout)
36423651
async with client.stream("GET", gateway_url, headers=headers, timeout=timeout) as response:
3643-
# This will raise immediately if status is 4xx/5xx
3644-
response.raise_for_status()
3652+
if unauthenticated_probe and response.status_code in (401, 403):
3653+
logger.debug(f"Gateway {gateway_name} returned {response.status_code} (auth rejected, server alive)")
3654+
else:
3655+
response.raise_for_status()
36453656
if span:
36463657
set_span_attribute(span, "http.status_code", response.status_code)
36473658
elif (gateway_transport).lower() == "streamablehttp":
@@ -3698,7 +3709,10 @@ def get_httpx_client_factory(
36983709
}
36993710
timeout = httpx.Timeout(settings.health_check_timeout)
37003711
response = await client.post(gateway_url, json=init_payload, headers=init_headers, timeout=timeout)
3701-
response.raise_for_status()
3712+
if unauthenticated_probe and response.status_code in (401, 403):
3713+
logger.debug(f"Gateway {gateway_name} returned {response.status_code} (auth rejected, server alive)")
3714+
else:
3715+
response.raise_for_status()
37023716

37033717
# Reactivate gateway if it was previously inactive and health check passed now
37043718
if gateway_enabled and not gateway_reachable:
@@ -3776,6 +3790,8 @@ def get_httpx_client_factory(
37763790
if span:
37773791
set_span_attribute(span, "health.status", "healthy")
37783792
set_span_attribute(span, "success", True)
3793+
if unauthenticated_probe:
3794+
set_span_attribute(span, "health.probe_type", "unauthenticated_liveness")
37793795

37803796
except Exception as e:
37813797
if span:

0 commit comments

Comments
 (0)