Skip to content

Commit e862fab

Browse files
committed
fix(active-checks): use HTTP/1.1 for health check probes with auto-fallback
Previously, active health check probes sent HTTP/1.0 requests. Upstream servers that only support HTTP/1.1 would respond with 426 (Upgrade Required), which is not in the default healthy/unhealthy status lists, causing health checks to silently become no-ops for those targets.

Switch the default probe request from HTTP/1.0 to HTTP/1.1 with a Connection: close header.

Add bidirectional version auto-detection in run_single_check() that automatically negotiates the HTTP version:
- On 505 (HTTP Version Not Supported): retry with the other version
- On 426 (Upgrade Required) while using HTTP/1.0: retry with HTTP/1.1
- On any non-healthy status with no cached version: retry with the other version to handle non-standard server implementations

The working HTTP version is cached per-target in memory to avoid repeated retries. The cache self-heals when servers change their supported HTTP version.

Refactor run_single_check() into focused helpers:
- build_http_headers(): builds and caches serialized header string
- establish_connection(): TCP connect + optional TLS handshake
- probe_http(): sends HTTP request and returns status code

FTI-7389

Signed-off-by: Walker Zhao <walker.zhao@konghq.com>
1 parent bcec8a7 commit e862fab

3 files changed

Lines changed: 1186 additions & 13 deletions

File tree

lib/resty/healthcheck.lua

Lines changed: 73 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -1078,27 +1078,36 @@ local function establish_connection(self, ip, port, hostname, hostheader, typ)
10781078
end
10791079

10801080

1081-
-- Sends an HTTP/1.1 GET request over an already-connected socket and reports
1082-
-- the result via report_http_status / report_tcp_failure / report_timeout.
1083-
-- Connection: close is injected so the server closes the connection after
1084-
-- responding (health probes are one-shot).
1085-
local function probe_http(self, sock, ip, port, hostname, hostheader)
1081+
-- Sends an HTTP GET request over an already-connected socket.
1082+
-- Returns the parsed HTTP status code (number), or nil if a transport-level
1083+
-- error occurred (timeout / TCP failure are reported internally).
1084+
-- @param http_version "1.0" or "1.1" (default "1.1"). For "1.1",
1085+
-- Connection: close is injected so the server closes the connection after
1086+
-- responding (health probes are one-shot).
1087+
local function probe_http(self, sock, ip, port, hostname, hostheader, http_version)
10861088
local headers = build_http_headers(self)
10871089
local path = self.checks.active.http_path
10881090
local host = hostheader or hostname or ip
10891091

1090-
local request = ("GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n%s\r\n"):format(
1091-
path, host, headers)
1092+
local request
1093+
if http_version == "1.0" then
1094+
request = ("GET %s HTTP/1.0\r\n%sHost: %s\r\n\r\n"):format(path, headers, host)
1095+
else
1096+
request = ("GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n%s\r\n"):format(
1097+
path, host, headers)
1098+
end
10921099
self:log(DEBUG, "request head: ", request)
10931100

10941101
local bytes, err = sock:send(request)
10951102
if not bytes then
10961103
self:log(ERR, "failed to send http request to '", hostname, " (", ip, ":", port, ")': ", err)
10971104
if err == "timeout" then
10981105
sock:close() -- timeout errors do not close the socket.
1099-
return self:report_timeout(ip, port, hostname, "active")
1106+
self:report_timeout(ip, port, hostname, "active")
1107+
else
1108+
self:report_tcp_failure(ip, port, hostname, "send", "active")
11001109
end
1101-
return self:report_tcp_failure(ip, port, hostname, "send", "active")
1110+
return nil
11021111
end
11031112

11041113
local status_line
@@ -1107,9 +1116,11 @@ local function probe_http(self, sock, ip, port, hostname, hostheader)
11071116
self:log(ERR, "failed to receive status line from '", hostname, " (",ip, ":", port, ")': ", err)
11081117
if err == "timeout" then
11091118
sock:close() -- timeout errors do not close the socket.
1110-
return self:report_timeout(ip, port, hostname, "active")
1119+
self:report_timeout(ip, port, hostname, "active")
1120+
else
1121+
self:report_tcp_failure(ip, port, hostname, "receive", "active")
11111122
end
1112-
return self:report_tcp_failure(ip, port, hostname, "receive", "active")
1123+
return nil
11131124
end
11141125

11151126
local from, to = re_find(status_line,
@@ -1126,7 +1137,7 @@ local function probe_http(self, sock, ip, port, hostname, hostheader)
11261137
sock:close()
11271138

11281139
self:log(DEBUG, "Reporting '", hostname, " (", ip, ":", port, ")' (got HTTP ", status, ")")
1129-
return self:report_http_status(ip, port, hostname, status, "active")
1140+
return status
11301141
end
11311142

11321143

@@ -1144,7 +1155,56 @@ function checker:run_single_check(ip, port, hostname, hostheader)
11441155
return self:report_success(ip, port, hostname, "active")
11451156
end
11461157

1147-
return probe_http(self, sock, ip, port, hostname, hostheader)
1158+
-- Use cached version preference; default "1.1"
1159+
local target = get_target(self, ip, port, hostname)
1160+
local http_version = (target and target.http_version) or "1.1"
1161+
1162+
local status = probe_http(self, sock, ip, port, hostname, hostheader, http_version)
1163+
if not status then
1164+
return -- error already reported inside probe_http
1165+
end
1166+
1167+
-- Version auto-detection:
1168+
-- 1. 505 = server doesn't support our version -> try the other (always, for self-healing)
1169+
-- 2. 426 on HTTP/1.0 = server wants upgrade -> try 1.1 (always, for self-healing)
1170+
-- 3. Any non-healthy status when no version cached -> try the other (handles non-standard servers)
1171+
local has_cached_version = target and target.http_version ~= nil
1172+
local is_healthy = self.checks.active.healthy.http_statuses[status]
1173+
local should_retry = (status == 505)
1174+
or (status == 426 and http_version == "1.0")
1175+
or (not is_healthy and not has_cached_version)
1176+
1177+
if should_retry then
1178+
local other_version = (http_version == "1.0") and "1.1" or "1.0"
1179+
self:log(WARN, "target '", hostname or "", " (", ip, ":", port,
1180+
")' returned ", status, " on HTTP/", http_version,
1181+
", retrying with HTTP/", other_version)
1182+
1183+
sock = establish_connection(self, ip, port, hostname, hostheader, typ)
1184+
if not sock then
1185+
return -- failure already reported
1186+
end
1187+
1188+
local retry_status = probe_http(self, sock, ip, port, hostname, hostheader, other_version)
1189+
if not retry_status then
1190+
return -- error already reported
1191+
end
1192+
1193+
-- Always cache after retry to prevent repeated retries:
1194+
-- If retry gave a healthy result, the other version works — adopt it.
1195+
-- Otherwise, stick with the original version and its status so that
1196+
-- health reporting reflects the version we actually cache.
1197+
if target then
1198+
if self.checks.active.healthy.http_statuses[retry_status] then
1199+
target.http_version = other_version
1200+
status = retry_status
1201+
else
1202+
target.http_version = http_version
1203+
end
1204+
end
1205+
end
1206+
1207+
return self:report_http_status(ip, port, hostname, status, "active")
11481208
end
11491209

11501210
-- executes a work package (a list of checks) sequentially

0 commit comments

Comments
 (0)