Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 74 additions & 40 deletions mcpgateway/services/gateway_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -3571,6 +3571,12 @@ def get_httpx_client_factory(
# Use isolated client for gateway health checks (each gateway may have custom CA cert)
# Use admin timeout for health checks (fail fast, don't wait 120s for slow upstreams)
# Pass ssl_context if present, otherwise let get_isolated_http_client use skip_ssl_verify setting
# Track whether this is an unauthenticated liveness probe
# (authorization_code gateway without system token). When True,
# HTTP 401/403 responses are treated as "server alive" rather
# than health-check failures.
unauthenticated_probe = False

async with get_isolated_http_client(timeout=settings.httpx_admin_read_timeout, verify=ssl_context) as client:
logger.debug(f"Checking health of gateway: {gateway_name} ({gateway_url_sanitized})")
try:
Expand All @@ -3581,40 +3587,40 @@ def get_httpx_client_factory(
grant_type = gateway_oauth_config.get("grant_type", "client_credentials")

if grant_type == "authorization_code":
# For Authorization Code flow, try to get stored tokens
# Authorization Code flow requires an interactive user
# to complete the OAuth dance. The health-check runs
# under a system identity (platform_admin_email) which
# typically has NO stored token for these gateways.
#
# Strategy: try to use a stored token if available;
# otherwise proceed WITHOUT auth. An unauthenticated
# probe still tests connectivity:
# - 401/403 β†’ server is alive (auth rejected, not down)
# - timeout/DNS/connection error β†’ real outage
# This avoids the old behaviour of marking the gateway
# as failed just because the system account has no token.
access_token = None
try:
# First-Party
from mcpgateway.services.token_storage_service import TokenStorageService # pylint: disable=import-outside-toplevel

# Use fresh session for OAuth token lookup
with fresh_db_session() as token_db:
token_storage = TokenStorageService(token_db)

# Get user-specific OAuth token
if not user_email:
if span:
set_span_attribute(span, "health.status", "unhealthy")
set_span_error(span, "User email required for OAuth token")
await self._handle_gateway_failure(gateway)
return

access_token = await token_storage.get_user_token(gateway_id, user_email)

if access_token:
headers["Authorization"] = f"Bearer {access_token}"
else:
if span:
set_span_attribute(span, "health.status", "unhealthy")
set_span_error(span, "No valid OAuth token for user")
await self._handle_gateway_failure(gateway)
return
if user_email:
with fresh_db_session() as token_db:
token_storage = TokenStorageService(token_db)
access_token = await token_storage.get_user_token(gateway_id, user_email)
except Exception as e:
logger.error(f"Failed to obtain stored OAuth token for gateway {gateway_name}: {e}")
if span:
set_span_attribute(span, "health.status", "unhealthy")
set_span_error(span, "Failed to obtain stored OAuth token")
await self._handle_gateway_failure(gateway)
return
logger.debug(f"Could not look up OAuth token for health check on {gateway_name}: {e}")

if access_token:
headers["Authorization"] = f"Bearer {access_token}"
else:
# No token β€” proceed without auth. The probe will
# likely get 401/403 which is fine (proves liveness).
unauthenticated_probe = True
logger.debug(
f"Health-checking authorization_code gateway "
f"{gateway_name} without auth (no system token)"
)
else:
# For Client Credentials flow, get token directly
try:
Expand All @@ -3636,12 +3642,17 @@ def get_httpx_client_factory(
else:
headers = {}

# Perform the GET and raise on 4xx/5xx
# Perform the actual connectivity probe.
# For unauthenticated probes (auth_code gateways without
# a system token), 401/403 proves the server is alive β€”
# only network errors indicate a real outage.
if (gateway_transport).lower() == "sse":
timeout = httpx.Timeout(settings.health_check_timeout)
async with client.stream("GET", gateway_url, headers=headers, timeout=timeout) as response:
# This will raise immediately if status is 4xx/5xx
response.raise_for_status()
if unauthenticated_probe and response.status_code in (401, 403):
logger.debug(f"Gateway {gateway_name} returned {response.status_code} (auth rejected, server alive)")
else:
response.raise_for_status()
if span:
set_span_attribute(span, "http.status_code", response.status_code)
elif (gateway_transport).lower() == "streamablehttp":
Expand Down Expand Up @@ -3673,14 +3684,35 @@ def get_httpx_client_factory(
with anyio.fail_after(settings.health_check_timeout):
await pooled.session.list_tools()
else:
async with streamablehttp_client(url=gateway_url, headers=headers, timeout=settings.health_check_timeout, httpx_client_factory=get_httpx_client_factory) as (
read_stream,
write_stream,
_get_session_id,
):
async with ClientSession(read_stream, write_stream) as session:
# Initialize the session
response = await session.initialize()
# Use a lightweight JSON-RPC POST ``initialize`` instead of the
# full SDK client. The SDK opens a GET SSE stream after
# initialize, which returns 405 on servers that don't support
# server-initiated messages (M365, Kubernetes MCP, GitHub).
# The MCP spec says GET is optional, so a successful POST
# ``initialize`` is sufficient proof of health.
# NOTE: ``ping`` would require an existing session, so
# ``initialize`` is the only stateless RPC we can send.
init_payload = {
"jsonrpc": "2.0",
"id": "health-check",
"method": "initialize",
"params": {
"protocolVersion": "2024-11-05",
"capabilities": {},
"clientInfo": {"name": "mcpgateway-health", "version": "1.0.0"},
},
}
init_headers = {
**headers,
"Content-Type": "application/json",
"Accept": "application/json, text/event-stream",
}
timeout = httpx.Timeout(settings.health_check_timeout)
response = await client.post(gateway_url, json=init_payload, headers=init_headers, timeout=timeout)
if unauthenticated_probe and response.status_code in (401, 403):
logger.debug(f"Gateway {gateway_name} returned {response.status_code} (auth rejected, server alive)")
else:
response.raise_for_status()

# Reactivate gateway if it was previously inactive and health check passed now
if gateway_enabled and not gateway_reachable:
Expand Down Expand Up @@ -3758,6 +3790,8 @@ def get_httpx_client_factory(
if span:
set_span_attribute(span, "health.status", "healthy")
set_span_attribute(span, "success", True)
if unauthenticated_probe:
set_span_attribute(span, "health.probe_type", "unauthenticated_liveness")

except Exception as e:
if span:
Expand Down