From bcec8a75d5f899b12c86b4deaec878e7c96e1d1d Mon Sep 17 00:00:00 2001 From: Walker Zhao Date: Tue, 24 Mar 2026 17:12:29 +0800 Subject: [PATCH 1/4] fix(active-checks): use HTTP/1.1 for health check probes Previously, active health check probes sent HTTP/1.0 requests, which caused upstream servers that only support HTTP/1.1 to respond with 426 Upgrade Required, incorrectly marking healthy targets as unhealthy. Switch the default probe request from HTTP/1.0 to HTTP/1.1 with Connection: close header. HTTP/1.1 is backward-compatible with all servers that accept HTTP/1.0, so this change requires no configuration updates from users. Refactored run_single_check() into focused helpers: - build_http_headers(): builds and caches serialized header string - establish_connection(): TCP connect + optional TLS handshake - probe_http(): sends HTTP/1.1 GET and reports result FTI-7389 Signed-off-by: Walker Zhao --- lib/resty/healthcheck.lua | 145 ++++++++++++++++---------- t/with_resty-events/18-req-headers.t | 25 +++-- t/with_worker-events/18-req-headers.t | 25 +++-- 3 files changed, 120 insertions(+), 75 deletions(-) diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index b4ff4255..a1cfb33a 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -985,60 +985,18 @@ end --============================================================================ --- Runs a single healthcheck probe -function checker:run_single_check(ip, port, hostname, hostheader) - - local sock, err = ngx.socket.tcp() - if not sock then - self:log(ERR, "failed to create stream socket: ", err) - return - end - - sock:settimeout(self.checks.active.timeout * 1000) - - local ok - ok, err = sock:connect(ip, port) - if not ok then - if err == "timeout" then - sock:close() -- timeout errors do not close the socket. - return self:report_timeout(ip, port, hostname, "active") - end - return self:report_tcp_failure(ip, port, hostname, "connect", "active") - end - - if self.checks.active.type == "tcp" then - sock:close() - return self:report_success(ip, port, hostname, "active") - end - - if self.checks.active.type == "https" then - local https_sni, session, err - https_sni = self.checks.active.https_sni or hostheader or hostname - if self.ssl_cert and self.ssl_key then - ok, err = sock:setclientcert(self.ssl_cert, self.ssl_key) - - if not ok then - self:log(ERR, "failed to set client certificate: ", err) - end - end - - session, err = sock:sslhandshake(nil, https_sni, - self.checks.active.https_verify_certificate) - - if not session then - sock:close() - self:log(ERR, "failed SSL handshake with '", hostname or "", " (", ip, ":", port, ")', using server name (sni) '", https_sni, "': ", err) - return self:report_tcp_failure(ip, port, hostname, "connect", "active") - end - +-- Builds and caches the serialized user-configured headers string for HTTP/1.x probes. +-- Uses ~= nil so that a cached empty string ("") is also a cache hit. +local function build_http_headers(self) + if self.checks.active._headers_str ~= nil then + return self.checks.active._headers_str end local req_headers = self.checks.active.headers local headers - if self.checks.active._headers_str then - headers = self.checks.active._headers_str - elseif req_headers == nil then - headers = "" + + if req_headers == nil then + headers = "" else local headers_length = nkeys(req_headers) if headers_length > 0 then @@ -1065,15 +1023,75 @@ function checker:run_single_check(ip, port, hostname, hostheader) headers = headers .. "\r\n" end end - self.checks.active._headers_str = headers or "" end + self.checks.active._headers_str = headers or "" + return self.checks.active._headers_str +end + + +-- Establishes a TCP connection and optionally performs a TLS handshake for +-- https type. Returns the connected socket, or nil when a failure has +-- already been reported via report_timeout / report_tcp_failure. +local function establish_connection(self, ip, port, hostname, hostheader, typ) + local sock, err = ngx.socket.tcp() + if not sock then + self:log(ERR, "failed to create stream socket: ", err) + return nil + end + + sock:settimeout(self.checks.active.timeout * 1000) + + local ok + ok, err = sock:connect(ip, port) + if not ok then + if err == "timeout" then + sock:close() -- timeout errors do not close the socket. + self:report_timeout(ip, port, hostname, "active") + else + self:report_tcp_failure(ip, port, hostname, "connect", "active") + end + return nil + end + + if typ == "https" then + local https_sni = self.checks.active.https_sni or hostheader or hostname + if self.ssl_cert and self.ssl_key then + ok, err = sock:setclientcert(self.ssl_cert, self.ssl_key) + if not ok then + self:log(ERR, "failed to set client certificate: ", err) + end + end + + local session + session, err = sock:sslhandshake(nil, https_sni, + self.checks.active.https_verify_certificate) + if not session then + sock:close() + self:log(ERR, "failed SSL handshake with '", hostname or "", " (", ip, ":", port, ")', using server name (sni) '", https_sni, "': ", err) + self:report_tcp_failure(ip, port, hostname, "connect", "active") + return nil + end + end + + return sock +end + + +-- Sends an HTTP/1.1 GET request over an already-connected socket and reports +-- the result via report_http_status / report_tcp_failure / report_timeout. +-- Connection: close is injected so the server closes the connection after +-- responding (health probes are one-shot). +local function probe_http(self, sock, ip, port, hostname, hostheader) + local headers = build_http_headers(self) local path = self.checks.active.http_path - local request = ("GET %s HTTP/1.0\r\n%sHost: %s\r\n\r\n"):format(path, headers, hostheader or hostname or ip) + local host = hostheader or hostname or ip + + local request = ("GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n%s\r\n"):format( + path, host, headers) self:log(DEBUG, "request head: ", request) - local bytes - bytes, err = sock:send(request) + local bytes, err = sock:send(request) if not bytes then self:log(ERR, "failed to send http request to '", hostname, " (", ip, ":", port, ")': ", err) if err == "timeout" then @@ -1108,10 +1126,27 @@ function checker:run_single_check(ip, port, hostname, hostheader) sock:close() self:log(DEBUG, "Reporting '", hostname, " (", ip, ":", port, ")' (got HTTP ", status, ")") - return self:report_http_status(ip, port, hostname, status, "active") end + +-- Runs a single healthcheck probe +function checker:run_single_check(ip, port, hostname, hostheader) + local typ = self.checks.active.type + + local sock = establish_connection(self, ip, port, hostname, hostheader, typ) + if not sock then + return -- failure already reported inside establish_connection + end + + if typ == "tcp" then + sock:close() + return self:report_success(ip, port, hostname, "active") + end + + return probe_http(self, sock, ip, port, hostname, hostheader) +end + -- executes a work package (a list of checks) sequentially function checker:run_work_package(work_package) for _, work_item in ipairs(work_package) do diff --git a/t/with_resty-events/18-req-headers.t b/t/with_resty-events/18-req-headers.t index 7fd69c33..51e6c1e1 100644 --- a/t/with_resty-events/18-req-headers.t +++ b/t/with_resty-events/18-req-headers.t @@ -79,9 +79,10 @@ true --- error_log checking healthy targets: nothing to do checking healthy targets: #1 -GET /status HTTP/1.0 -User-Agent: curl/7.29.0 +GET /status HTTP/1.1 Host: 127.0.0.1 +Connection: close +User-Agent: curl/7.29.0 @@ -128,9 +129,10 @@ true --- error_log checking healthy targets: nothing to do checking healthy targets: #1 -GET /status HTTP/1.0 -User-Agent: curl +GET /status HTTP/1.1 Host: 127.0.0.1 +Connection: close +User-Agent: curl === TEST 3: headers: { ["User-Agent"] = "curl" } @@ -176,9 +178,10 @@ true --- error_log checking healthy targets: nothing to do checking healthy targets: #1 -GET /status HTTP/1.0 -User-Agent: curl +GET /status HTTP/1.1 Host: 127.0.0.1 +Connection: close +User-Agent: curl @@ -225,9 +228,10 @@ true --- error_log checking healthy targets: nothing to do checking healthy targets: #1 -GET /status HTTP/1.0 -User-Agent: curl +GET /status HTTP/1.1 Host: 127.0.0.1 +Connection: close +User-Agent: curl @@ -274,7 +278,8 @@ true --- error_log checking healthy targets: nothing to do checking healthy targets: #1 -GET /status HTTP/1.0 +GET /status HTTP/1.1 +Host: 127.0.0.1 +Connection: close User-Agent: curl User-Agent: nginx -Host: 127.0.0.1 diff --git a/t/with_worker-events/18-req-headers.t b/t/with_worker-events/18-req-headers.t index 3c2433e2..e2a771cb 100644 --- a/t/with_worker-events/18-req-headers.t +++ b/t/with_worker-events/18-req-headers.t @@ -59,9 +59,10 @@ true --- error_log checking healthy targets: nothing to do checking healthy targets: #1 -GET /status HTTP/1.0 -User-Agent: curl/7.29.0 +GET /status HTTP/1.1 Host: 127.0.0.1 +Connection: close +User-Agent: curl/7.29.0 @@ -109,9 +110,10 @@ true --- error_log checking healthy targets: nothing to do checking healthy targets: #1 -GET /status HTTP/1.0 -User-Agent: curl +GET /status HTTP/1.1 Host: 127.0.0.1 +Connection: close +User-Agent: curl === TEST 3: headers: { ["User-Agent"] = "curl" } @@ -158,9 +160,10 @@ true --- error_log checking healthy targets: nothing to do checking healthy targets: #1 -GET /status HTTP/1.0 -User-Agent: curl +GET /status HTTP/1.1 Host: 127.0.0.1 +Connection: close +User-Agent: curl @@ -208,9 +211,10 @@ true --- error_log checking healthy targets: nothing to do checking healthy targets: #1 -GET /status HTTP/1.0 -User-Agent: curl +GET /status HTTP/1.1 Host: 127.0.0.1 +Connection: close +User-Agent: curl @@ -258,7 +262,8 @@ true --- error_log checking healthy targets: nothing to do checking healthy targets: #1 -GET /status HTTP/1.0 +GET /status HTTP/1.1 +Host: 127.0.0.1 +Connection: close User-Agent: curl User-Agent: nginx -Host: 127.0.0.1 From e862fab8859c3644abbb1e5a9f4fd9b9edec750c Mon Sep 17 00:00:00 2001 From: Walker Zhao Date: Wed, 25 Mar 2026 16:52:48 +0800 Subject: [PATCH 2/4] fix(active-checks): use HTTP/1.1 for health check probes with auto-fallback Previously, active health check probes sent HTTP/1.0 requests. Upstream servers that only support HTTP/1.1 would respond with 426 (Upgrade Required), which is not in the default healthy/unhealthy status lists, causing health checks to silently become no-ops for those targets. Switch the default probe request from HTTP/1.0 to HTTP/1.1 with a Connection: close header. Add bidirectional version auto-detection in run_single_check() that automatically negotiates the HTTP version: - On 505 (HTTP Version Not Supported): retry with the other version - On 426 (Upgrade Required) while using HTTP/1.0: retry with HTTP/1.1 - On any non-healthy status with no cached version: retry with the other version to handle non-standard server implementations The working HTTP version is cached per-target in memory to avoid repeated retries. The cache self-heals when servers change their supported HTTP version. Refactored run_single_check() into focused helpers: - build_http_headers(): builds and caches serialized header string - establish_connection(): TCP connect + optional TLS handshake - probe_http(): sends HTTP request and returns status code FTI-7389 Signed-off-by: Walker Zhao --- lib/resty/healthcheck.lua | 86 ++- .../20-http-version-negotiation.t | 558 ++++++++++++++++++ .../20-http-version-negotiation.t | 555 +++++++++++++++++ 3 files changed, 1186 insertions(+), 13 deletions(-) create mode 100644 t/with_resty-events/20-http-version-negotiation.t create mode 100644 t/with_worker-events/20-http-version-negotiation.t diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index a1cfb33a..5d7ad74a 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -1078,17 +1078,24 @@ local function establish_connection(self, ip, port, hostname, hostheader, typ) end --- Sends an HTTP/1.1 GET request over an already-connected socket and reports --- the result via report_http_status / report_tcp_failure / report_timeout. --- Connection: close is injected so the server closes the connection after --- responding (health probes are one-shot). -local function probe_http(self, sock, ip, port, hostname, hostheader) +-- Sends an HTTP GET request over an already-connected socket. +-- Returns the parsed HTTP status code (number), or nil if a transport-level +-- error occurred (timeout / TCP failure are reported internally). +-- @param http_version "1.0" or "1.1" (default "1.1"). For "1.1", +-- Connection: close is injected so the server closes the connection after +-- responding (health probes are one-shot). +local function probe_http(self, sock, ip, port, hostname, hostheader, http_version) local headers = build_http_headers(self) local path = self.checks.active.http_path local host = hostheader or hostname or ip - local request = ("GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n%s\r\n"):format( - path, host, headers) + local request + if http_version == "1.0" then + request = ("GET %s HTTP/1.0\r\n%sHost: %s\r\n\r\n"):format(path, headers, host) + else + request = ("GET %s HTTP/1.1\r\nHost: %s\r\nConnection: close\r\n%s\r\n"):format( + path, host, headers) + end self:log(DEBUG, "request head: ", request) local bytes, err = sock:send(request) @@ -1096,9 +1103,11 @@ local function probe_http(self, sock, ip, port, hostname, hostheader) self:log(ERR, "failed to send http request to '", hostname, " (", ip, ":", port, ")': ", err) if err == "timeout" then sock:close() -- timeout errors do not close the socket. - return self:report_timeout(ip, port, hostname, "active") + self:report_timeout(ip, port, hostname, "active") + else + self:report_tcp_failure(ip, port, hostname, "send", "active") end - return self:report_tcp_failure(ip, port, hostname, "send", "active") + return nil end local status_line @@ -1107,9 +1116,11 @@ local function probe_http(self, sock, ip, port, hostname, hostheader) self:log(ERR, "failed to receive status line from '", hostname, " (",ip, ":", port, ")': ", err) if err == "timeout" then sock:close() -- timeout errors do not close the socket. - return self:report_timeout(ip, port, hostname, "active") + self:report_timeout(ip, port, hostname, "active") + else + self:report_tcp_failure(ip, port, hostname, "receive", "active") end - return self:report_tcp_failure(ip, port, hostname, "receive", "active") + return nil end local from, to = re_find(status_line, @@ -1126,7 +1137,7 @@ local function probe_http(self, sock, ip, port, hostname, hostheader) sock:close() self:log(DEBUG, "Reporting '", hostname, " (", ip, ":", port, ")' (got HTTP ", status, ")") - return self:report_http_status(ip, port, hostname, status, "active") + return status end @@ -1144,7 +1155,56 @@ function checker:run_single_check(ip, port, hostname, hostheader) return self:report_success(ip, port, hostname, "active") end - return probe_http(self, sock, ip, port, hostname, hostheader) + -- Use cached version preference; default "1.1" + local target = get_target(self, ip, port, hostname) + local http_version = (target and target.http_version) or "1.1" + + local status = probe_http(self, sock, ip, port, hostname, hostheader, http_version) + if not status then + return -- error already reported inside probe_http + end + + -- Version auto-detection: + -- 1. 505 = server doesn't support our version -> try the other (always, for self-healing) + -- 2. 426 on HTTP/1.0 = server wants upgrade -> try 1.1 (always, for self-healing) + -- 3. Any non-healthy status when no version cached -> try the other (handles non-standard servers) + local has_cached_version = target and target.http_version ~= nil + local is_healthy = self.checks.active.healthy.http_statuses[status] + local should_retry = (status == 505) + or (status == 426 and http_version == "1.0") + or (not is_healthy and not has_cached_version) + + if should_retry then + local other_version = (http_version == "1.0") and "1.1" or "1.0" + self:log(WARN, "target '", hostname or "", " (", ip, ":", port, + ")' returned ", status, " on HTTP/", http_version, + ", retrying with HTTP/", other_version) + + sock = establish_connection(self, ip, port, hostname, hostheader, typ) + if not sock then + return -- failure already reported + end + + local retry_status = probe_http(self, sock, ip, port, hostname, hostheader, other_version) + if not retry_status then + return -- error already reported + end + + -- Always cache after retry to prevent repeated retries: + -- If retry gave a healthy result, the other version works — adopt it. + -- Otherwise, stick with the original version and its status so that + -- health reporting reflects the version we actually cache. + if target then + if self.checks.active.healthy.http_statuses[retry_status] then + target.http_version = other_version + status = retry_status + else + target.http_version = http_version + end + end + end + + return self:report_http_status(ip, port, hostname, status, "active") end -- executes a work package (a list of checks) sequentially diff --git a/t/with_resty-events/20-http-version-negotiation.t b/t/with_resty-events/20-http-version-negotiation.t new file mode 100644 index 00000000..375421f8 --- /dev/null +++ b/t/with_resty-events/20-http-version-negotiation.t @@ -0,0 +1,558 @@ +use Test::Nginx::Socket::Lua; +use Cwd qw(cwd); + +workers(1); + +plan tests => repeat_each() * 38; + +my $pwd = cwd(); +$ENV{TEST_NGINX_SERVROOT} = server_root(); + +our $HttpConfig = qq{ + lua_package_path "$pwd/lib/?.lua;;"; + lua_shared_dict test_shm 8m; + + init_worker_by_lua_block { + local we = require "resty.events.compat" + assert(we.configure({ + unique_timeout = 5, + broker_id = 0, + listening = "unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock" + })) + assert(we.configured()) + } + + server { + server_name kong_worker_events; + listen unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock; + access_log off; + location / { + content_by_lua_block { + require("resty.events.compat").run() + } + } + } +}; + +run_tests(); + +__DATA__ + + + +=== TEST 1: 505 auto-fallback, HTTP/1.0-only server, target stays healthy +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2114; + location = /status { + content_by_lua_block { + if ngx.var.server_protocol == "HTTP/1.1" then + return ngx.exit(505) + end + ngx.exit(200) + } + } + } +} +--- config + location = /t { + content_by_lua_block { + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 1, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, false) + ngx.sleep(0.6) + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- true + } + } +--- request +GET /t +--- response_body +true +--- error_log +returned 505 on HTTP/1.1, retrying with HTTP/1.0 + + + +=== TEST 2: 426 auto-upgrade, server switches from HTTP/1.0-only to HTTP/1.1-only +--- timeout: 5 +--- http_config eval +qq{ + $::HttpConfig + + lua_shared_dict mock_state 1m; + + server { + listen 2114; + location = /status { + content_by_lua_block { + local phase = ngx.shared.mock_state:get("phase") or 1 + if phase == 1 then + -- Phase 1: only accept HTTP/1.0 + if ngx.var.server_protocol == "HTTP/1.1" then + return ngx.exit(505) + end + ngx.exit(200) + else + -- Phase 2: only accept HTTP/1.1 + if ngx.var.server_protocol ~= "HTTP/1.1" then + return ngx.exit(426) + end + ngx.exit(200) + end + } + } + } +} +--- config + location = /t { + content_by_lua_block { + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 1, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, false) + -- Phase 1: server only supports HTTP/1.0 + ngx.sleep(0.6) + local status1 = checker:get_target_status("127.0.0.1", 2114) + ngx.say("phase1: ", status1) -- true (healthy via 1.0 fallback) + + -- Switch to phase 2: server now only supports HTTP/1.1 + ngx.shared.mock_state:set("phase", 2) + ngx.sleep(0.6) + local status2 = checker:get_target_status("127.0.0.1", 2114) + ngx.say("phase2: ", status2) -- true (healthy via 1.1 upgrade) + } + } +--- request +GET /t +--- response_body +phase1: true +phase2: true +--- error_log +returned 505 on HTTP/1.1, retrying with HTTP/1.0 +returned 426 on HTTP/1.0, retrying with HTTP/1.1 + + + +=== TEST 3: version caching, no repeated retries after stabilization +--- timeout: 5 +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2114; + location = /status { + content_by_lua_block { + if ngx.var.server_protocol == "HTTP/1.1" then + return ngx.exit(505) + end + ngx.exit(200) + } + } + } +} +--- config + location = /t { + content_by_lua_block { + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 1, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, false) + -- Wait long enough for multiple check intervals so caching takes effect + ngx.sleep(1.5) + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- true + } + } +--- request +GET /t +--- response_body +true +--- error_log eval +[ + qr/returned 505 on HTTP\/1\.1, retrying with HTTP\/1\.0/, + qr/healthy SUCCESS increment/, +] +--- no_error_log eval +qr/returned 505 on HTTP\/1\.1, retrying with HTTP\/1\.0.*returned 505 on HTTP\/1\.1, retrying with HTTP\/1\.0/s + + + +=== TEST 4: permanent failure, both versions return 505, target goes unhealthy +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2114; + location = /status { + return 505; + } + } +} +--- config + location = /t { + content_by_lua_block { + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 3, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, true) + ngx.sleep(0.6) + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- false + } + } +--- request +GET /t +--- response_body +false +--- error_log +returned 505 on HTTP/1.1, retrying with HTTP/1.0 +unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)' +event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false' + + + +=== TEST 5: normal HTTP/1.1, no version negotiation triggered +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2114; + location = /status { + return 200; + } + } +} +--- config + location = /t { + content_by_lua_block { + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 1, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, false) + ngx.sleep(0.6) + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- true + } + } +--- request +GET /t +--- response_body +true +--- no_error_log +retrying with HTTP/ + + + +=== TEST 6: 426 on HTTP/1.1, retried once then cached (server wants HTTP/2) +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2114; + location = /status { + return 426; + } + } +} +--- config + location = /t { + content_by_lua_block { + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 3, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, true) + ngx.sleep(0.6) + -- 426 is not in the default healthy or unhealthy lists, + -- so status should remain unchanged (still true) + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- true + } + } +--- request +GET /t +--- response_body +true +--- error_log +retrying with HTTP/ + + + +=== TEST 7: non-standard server, returns 400 on HTTP/1.1, 200 on HTTP/1.0 +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2114; + location = /status { + content_by_lua_block { + if ngx.var.server_protocol == "HTTP/1.1" then + return ngx.exit(400) + end + ngx.exit(200) + } + } + } +} +--- config + location = /t { + content_by_lua_block { + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 1, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, false) + ngx.sleep(0.6) + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- true + } + } +--- request +GET /t +--- response_body +true +--- error_log +returned 400 on HTTP/1.1, retrying with HTTP/1.0 + + + +=== TEST 8: genuinely unhealthy server (500 on both versions), retries once then caches +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2114; + location = /status { + return 500; + } + } +} +--- config + location = /t { + content_by_lua_block { + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 3, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, true) + ngx.sleep(0.6) + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- false + } + } +--- request +GET /t +--- response_body +false +--- error_log +unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)' +event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false' + + + +=== TEST 9: failed retry reports original status, not retry status +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2114; + location = /status { + content_by_lua_block { + if ngx.var.server_protocol == "HTTP/1.1" then + return ngx.exit(418) + end + ngx.exit(500) + } + } + } +} +--- config + location = /t { + content_by_lua_block { + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 3, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, true) + ngx.sleep(0.6) + -- 418 (original) is not in any list, so target stays healthy. + -- Without the fix, retry's 500 would be reported and cause + -- unhealthy increments. + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- true + } + } +--- request +GET /t +--- response_body +true +--- error_log +returned 418 on HTTP/1.1, retrying with HTTP/1.0 +--- no_error_log +unhealthy HTTP increment diff --git a/t/with_worker-events/20-http-version-negotiation.t b/t/with_worker-events/20-http-version-negotiation.t new file mode 100644 index 00000000..07cf98e4 --- /dev/null +++ b/t/with_worker-events/20-http-version-negotiation.t @@ -0,0 +1,555 @@ +use Test::Nginx::Socket::Lua; +use Cwd qw(cwd); + +workers(1); + +plan tests => repeat_each() * 38; + +my $pwd = cwd(); + +our $HttpConfig = qq{ + lua_package_path "$pwd/lib/?.lua;;"; + lua_shared_dict test_shm 8m; + lua_shared_dict my_worker_events 8m; +}; + +run_tests(); + +__DATA__ + + + +=== TEST 1: 505 auto-fallback, HTTP/1.0-only server, target stays healthy +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2114; + location = /status { + content_by_lua_block { + if ngx.var.server_protocol == "HTTP/1.1" then + return ngx.exit(505) + end + ngx.exit(200) + } + } + } +} +--- config + location = /t { + content_by_lua_block { + local we = require "resty.worker.events" + assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.worker.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 1, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, false) + ngx.sleep(0.6) + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- true + } + } +--- request +GET /t +--- response_body +true +--- error_log +returned 505 on HTTP/1.1, retrying with HTTP/1.0 + + + +=== TEST 2: 426 auto-upgrade, server switches from HTTP/1.0-only to HTTP/1.1-only +--- timeout: 5 +--- http_config eval +qq{ + $::HttpConfig + + lua_shared_dict mock_state 1m; + + server { + listen 2114; + location = /status { + content_by_lua_block { + local phase = ngx.shared.mock_state:get("phase") or 1 + if phase == 1 then + -- Phase 1: only accept HTTP/1.0 + if ngx.var.server_protocol == "HTTP/1.1" then + return ngx.exit(505) + end + ngx.exit(200) + else + -- Phase 2: only accept HTTP/1.1 + if ngx.var.server_protocol ~= "HTTP/1.1" then + return ngx.exit(426) + end + ngx.exit(200) + end + } + } + } +} +--- config + location = /t { + content_by_lua_block { + local we = require "resty.worker.events" + assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.worker.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 1, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, false) + -- Phase 1: server only supports HTTP/1.0 + ngx.sleep(0.6) + local status1 = checker:get_target_status("127.0.0.1", 2114) + ngx.say("phase1: ", status1) -- true (healthy via 1.0 fallback) + + -- Switch to phase 2: server now only supports HTTP/1.1 + ngx.shared.mock_state:set("phase", 2) + ngx.sleep(0.6) + local status2 = checker:get_target_status("127.0.0.1", 2114) + ngx.say("phase2: ", status2) -- true (healthy via 1.1 upgrade) + } + } +--- request +GET /t +--- response_body +phase1: true +phase2: true +--- error_log +returned 505 on HTTP/1.1, retrying with HTTP/1.0 +returned 426 on HTTP/1.0, retrying with HTTP/1.1 + + + +=== TEST 3: version caching, no repeated retries after stabilization +--- timeout: 5 +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2114; + location = /status { + content_by_lua_block { + if ngx.var.server_protocol == "HTTP/1.1" then + return ngx.exit(505) + end + ngx.exit(200) + } + } + } +} +--- config + location = /t { + content_by_lua_block { + local we = require "resty.worker.events" + assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.worker.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 1, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, false) + -- Wait long enough for multiple check intervals so caching takes effect + ngx.sleep(1.5) + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- true + } + } +--- request +GET /t +--- response_body +true +--- error_log eval +[ + qr/returned 505 on HTTP\/1\.1, retrying with HTTP\/1\.0/, + qr/healthy SUCCESS increment/, +] +--- no_error_log eval +qr/returned 505 on HTTP\/1\.1, retrying with HTTP\/1\.0.*returned 505 on HTTP\/1\.1, retrying with HTTP\/1\.0/s + + + +=== TEST 4: permanent failure, both versions return 505, target goes unhealthy +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2114; + location = /status { + return 505; + } + } +} +--- config + location = /t { + content_by_lua_block { + local we = require "resty.worker.events" + assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.worker.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 3, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, true) + ngx.sleep(0.6) + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- false + } + } +--- request +GET /t +--- response_body +false +--- error_log +returned 505 on HTTP/1.1, retrying with HTTP/1.0 +unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)' +event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false' + + + +=== TEST 5: normal HTTP/1.1, no version negotiation triggered +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2114; + location = /status { + return 200; + } + } +} +--- config + location = /t { + content_by_lua_block { + local we = require "resty.worker.events" + assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.worker.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 1, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, false) + ngx.sleep(0.6) + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- true + } + } +--- request +GET /t +--- response_body +true +--- no_error_log +retrying with HTTP/ + + + +=== TEST 6: 426 on HTTP/1.1, retried once then cached (server wants HTTP/2) +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2114; + location = /status { + return 426; + } + } +} +--- config + location = /t { + content_by_lua_block { + local we = require "resty.worker.events" + assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.worker.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 3, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, true) + ngx.sleep(0.6) + -- 426 is not in the default healthy or unhealthy lists, + -- so status should remain unchanged (still true) + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- true + } + } +--- request +GET /t +--- response_body +true +--- error_log +retrying with HTTP/ + + + +=== TEST 7: non-standard server, returns 400 on HTTP/1.1, 200 on HTTP/1.0 +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2114; + location = /status { + content_by_lua_block { + if ngx.var.server_protocol == "HTTP/1.1" then + return ngx.exit(400) + end + ngx.exit(200) + } + } + } +} +--- config + location = /t { + content_by_lua_block { + local we = require "resty.worker.events" + assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.worker.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 1, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, false) + ngx.sleep(0.6) + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- true + } + } +--- request +GET /t +--- response_body +true +--- error_log +returned 400 on HTTP/1.1, retrying with HTTP/1.0 + + + +=== TEST 8: genuinely unhealthy server (500 on both versions), retries once then caches +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2114; + location = /status { + return 500; + } + } +} +--- config + location = /t { + content_by_lua_block { + local we = require "resty.worker.events" + assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.worker.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 3, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, true) + ngx.sleep(0.6) + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- false + } + } +--- request +GET /t +--- response_body +false +--- error_log +unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)' +event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false' + + + +=== TEST 9: failed retry reports original status, not retry status +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2114; + location = /status { + content_by_lua_block { + if ngx.var.server_protocol == "HTTP/1.1" then + return ngx.exit(418) + end + ngx.exit(500) + } + } + } +} +--- config + location = /t { + content_by_lua_block { + local we = require "resty.worker.events" + assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + events_module = "resty.worker.events", + type = "http", + checks = { + active = { + http_path = "/status", + healthy = { + interval = 0.1, + successes = 3, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + ngx.sleep(2) + local ok, err = checker:add_target("127.0.0.1", 2114, nil, true) + ngx.sleep(0.6) + -- 418 (original) is not in any list, so target stays healthy. + -- Without the fix, retry's 500 would be reported and cause + -- unhealthy increments. + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- true + } + } +--- request +GET /t +--- response_body +true +--- error_log +returned 418 on HTTP/1.1, retrying with HTTP/1.0 +--- no_error_log +unhealthy HTTP increment From e71b857423140a878c50b633cf2baf1ee595e24e Mon Sep 17 00:00:00 2001 From: Walker Zhao Date: Thu, 26 Mar 2026 16:54:51 +0800 Subject: [PATCH 3/4] refactor(active-checks): extract negotiate_http_version() and simplify retry logic Extract the HTTP version negotiation logic from run_single_check() into a dedicated negotiate_http_version() local function, reducing run_single_check() from ~85 lines to ~20 lines with flat control flow. Remove the unknown-status retry trigger and http_version_locked flag, keeping only 3 clear retry triggers: 505, 426-on-1.0, and non-healthy-with-no-cache. Unknown status codes are silently ignored by report_http_status(), matching the original behavior. Fix bad status line handling: probe_http() now returns 0 instead of nil for malformed HTTP responses, preserving the original behavior where report_http_status() treats 0 as unhealthy. Previously, the refactor caused these to be silently ignored. FTI-7389 Signed-off-by: Walker Zhao --- lib/resty/healthcheck.lua | 106 +++++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 42 deletions(-) diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index 5d7ad74a..f1295cb8 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -1131,7 +1131,7 @@ local function probe_http(self, sock, ip, port, hostname, hostheader, http_versi status = tonumber(status_line:sub(from, to)) else self:log(ERR, "bad status line from '", hostname, " (", ip, ":", port, ")': ", status_line) - -- note: 'status' will be reported as 'nil' + status = 0 -- report_http_status treats 0 as unhealthy end sock:close() @@ -1141,13 +1141,70 @@ local function probe_http(self, sock, ip, port, hostname, hostheader, http_versi end +-- Negotiates the HTTP version for a target based on the probe result. +-- If the status suggests a version mismatch, retries with the other version. +-- Updates the target's cached version preference. +-- Returns the final status to report, or nil if a transport error occurred. +local function negotiate_http_version(self, target, ip, port, hostname, + hostheader, typ, http_version, status) + local is_healthy = self.checks.active.healthy.http_statuses[status] + local has_cached_version = target and target.http_version ~= nil + + -- Version auto-detection: + -- 1. 505 -> try the other version (always, for self-healing) + -- 2. 426 on HTTP/1.0 -> try HTTP/1.1 (always, for self-healing) + -- 3. Any non-healthy + no cache -> try the other (first-probe discovery) + local should_retry = (status == 505) + or (status == 426 and http_version == "1.0") + or (not is_healthy and not has_cached_version) + + if not should_retry then + return status + end + + local other_version = (http_version == "1.0") and "1.1" or "1.0" + self:log(WARN, "target '", hostname or "", " (", ip, ":", port, + ")' returned ", status, " on HTTP/", http_version, + ", retrying with HTTP/", other_version) + + local sock = establish_connection(self, ip, port, hostname, hostheader, typ) + if not sock then + return nil + end + + local retry_status = probe_http(self, sock, ip, port, hostname, hostheader, other_version) + if not retry_status then + return nil + end + + -- Decide which status to report and cache the version preference. + -- If retry gave a healthy result, the other version works — adopt it. + -- Otherwise, stick with the original version and its status so that + -- health reporting reflects the version we actually cache. + local retry_is_healthy = self.checks.active.healthy.http_statuses[retry_status] + local final_status + + if target then + if retry_is_healthy then + final_status = retry_status + target.http_version = other_version + else + final_status = status + target.http_version = http_version + end + end + + return final_status +end + + -- Runs a single healthcheck probe function checker:run_single_check(ip, port, hostname, hostheader) local typ = self.checks.active.type local sock = establish_connection(self, ip, port, hostname, hostheader, typ) if not sock then - return -- failure already reported inside establish_connection + return end if typ == "tcp" then @@ -1155,53 +1212,18 @@ function checker:run_single_check(ip, port, hostname, hostheader) return self:report_success(ip, port, hostname, "active") end - -- Use cached version preference; default "1.1" local target = get_target(self, ip, port, hostname) local http_version = (target and target.http_version) or "1.1" local status = probe_http(self, sock, ip, port, hostname, hostheader, http_version) if not status then - return -- error already reported inside probe_http + return end - -- Version auto-detection: - -- 1. 505 = server doesn't support our version -> try the other (always, for self-healing) - -- 2. 426 on HTTP/1.0 = server wants upgrade -> try 1.1 (always, for self-healing) - -- 3. Any non-healthy status when no version cached -> try the other (handles non-standard servers) - local has_cached_version = target and target.http_version ~= nil - local is_healthy = self.checks.active.healthy.http_statuses[status] - local should_retry = (status == 505) - or (status == 426 and http_version == "1.0") - or (not is_healthy and not has_cached_version) - - if should_retry then - local other_version = (http_version == "1.0") and "1.1" or "1.0" - self:log(WARN, "target '", hostname or "", " (", ip, ":", port, - ")' returned ", status, " on HTTP/", http_version, - ", retrying with HTTP/", other_version) - - sock = establish_connection(self, ip, port, hostname, hostheader, typ) - if not sock then - return -- failure already reported - end - - local retry_status = probe_http(self, sock, ip, port, hostname, hostheader, other_version) - if not retry_status then - return -- error already reported - end - - -- Always cache after retry to prevent repeated retries: - -- If retry gave a healthy result, the other version works — adopt it. - -- Otherwise, stick with the original version and its status so that - -- health reporting reflects the version we actually cache. - if target then - if self.checks.active.healthy.http_statuses[retry_status] then - target.http_version = other_version - status = retry_status - else - target.http_version = http_version - end - end + status = negotiate_http_version(self, target, ip, port, hostname, + hostheader, typ, http_version, status) + if not status then + return end return self:report_http_status(ip, port, hostname, status, "active") From d7244d46ceaec5f8bf7cd0ae773c3bb7384ef9e4 Mon Sep 17 00:00:00 2001 From: Walker Zhao Date: Wed, 8 Apr 2026 16:31:22 +0800 Subject: [PATCH 4/4] fix(active-checks): narrow version negotiation retry to only 505 and 426 status codes Remove overly broad trigger that retried on any non-healthy status for uncached targets, which caused unnecessary retry connections for genuinely unhealthy servers (e.g., 500). Now only standard HTTP version negotiation codes (505 HTTP Version Not Supported, 426 Upgrade Required) trigger a retry, gated on `not is_healthy` to respect user-configured healthy status lists while preserving self-healing for cached targets. Signed-off-by: Walker Zhao --- lib/resty/healthcheck.lua | 19 ++++---- .../20-http-version-negotiation.t | 43 +++++++++++-------- .../20-http-version-negotiation.t | 43 +++++++++++-------- 3 files changed, 58 insertions(+), 47 deletions(-) diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index f1295cb8..8da9e21a 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -1148,15 +1148,16 @@ end local function negotiate_http_version(self, target, ip, port, hostname, hostheader, typ, http_version, status) local is_healthy = self.checks.active.healthy.http_statuses[status] - local has_cached_version = target and target.http_version ~= nil - - -- Version auto-detection: - -- 1. 505 -> try the other version (always, for self-healing) - -- 2. 426 on HTTP/1.0 -> try HTTP/1.1 (always, for self-healing) - -- 3. Any non-healthy + no cache -> try the other (first-probe discovery) - local should_retry = (status == 505) - or (status == 426 and http_version == "1.0") - or (not is_healthy and not has_cached_version) + + -- Version auto-detection (only for standard HTTP version codes): + -- 1. 505 (HTTP Version Not Supported) -> try the other version + -- 2. 426 (Upgrade Required) on HTTP/1.0 -> try HTTP/1.1 + -- Both triggers are gated on `not is_healthy` to respect user configuration. + -- Both triggers fire regardless of cache state, enabling self-healing when + -- a server changes its supported HTTP version. + local should_retry = not is_healthy and + ((status == 505) or + (status == 426 and http_version == "1.0")) if not should_retry then return status diff --git a/t/with_resty-events/20-http-version-negotiation.t b/t/with_resty-events/20-http-version-negotiation.t index 375421f8..cb25164c 100644 --- a/t/with_resty-events/20-http-version-negotiation.t +++ b/t/with_resty-events/20-http-version-negotiation.t @@ -3,7 +3,7 @@ use Cwd qw(cwd); workers(1); -plan tests => repeat_each() * 38; +plan tests => repeat_each() * 41; my $pwd = cwd(); $ENV{TEST_NGINX_SERVROOT} = server_root(); @@ -339,7 +339,7 @@ retrying with HTTP/ -=== TEST 6: 426 on HTTP/1.1, retried once then cached (server wants HTTP/2) +=== TEST 6: 426 on HTTP/1.1, no retry (downgrading to 1.0 won't help) --- http_config eval qq{ $::HttpConfig @@ -386,12 +386,12 @@ qq{ GET /t --- response_body true ---- error_log +--- no_error_log retrying with HTTP/ -=== TEST 7: non-standard server, returns 400 on HTTP/1.1, 200 on HTTP/1.0 +=== TEST 7: non-standard server returns 400 on HTTP/1.1, no retry (only 505/426 trigger retry) --- http_config eval qq{ $::HttpConfig @@ -434,19 +434,20 @@ qq{ ngx.sleep(2) local ok, err = checker:add_target("127.0.0.1", 2114, nil, false) ngx.sleep(0.6) - ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- true + -- 400 is not in healthy or unhealthy lists, no retry, status unchanged + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- false } } --- request GET /t --- response_body -true ---- error_log -returned 400 on HTTP/1.1, retrying with HTTP/1.0 +false +--- no_error_log +retrying with HTTP/ -=== TEST 8: genuinely unhealthy server (500 on both versions), retries once then caches +=== TEST 8: genuinely unhealthy server (500), no version retry --- http_config eval qq{ $::HttpConfig @@ -496,6 +497,8 @@ unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)' unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)' unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)' event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false' +--- no_error_log +retrying with HTTP/ @@ -509,9 +512,10 @@ qq{ location = /status { content_by_lua_block { if ngx.var.server_protocol == "HTTP/1.1" then - return ngx.exit(418) + return ngx.exit(505) end - ngx.exit(500) + -- Retry with HTTP/1.0 also fails (418 is not in any list) + ngx.exit(418) } } } @@ -542,17 +546,18 @@ qq{ ngx.sleep(2) local ok, err = checker:add_target("127.0.0.1", 2114, nil, true) ngx.sleep(0.6) - -- 418 (original) is not in any list, so target stays healthy. - -- Without the fix, retry's 500 would be reported and cause - -- unhealthy increments. - ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- true + -- Retry gets 418 (not healthy), so original 505 is reported. + -- 505 is in the default unhealthy list, causing unhealthy increments. + -- If retry's 418 were incorrectly reported, no increments would happen. + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- false } } --- request GET /t --- response_body -true +false --- error_log -returned 418 on HTTP/1.1, retrying with HTTP/1.0 ---- no_error_log -unhealthy HTTP increment +returned 505 on HTTP/1.1, retrying with HTTP/1.0 +unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)' diff --git a/t/with_worker-events/20-http-version-negotiation.t b/t/with_worker-events/20-http-version-negotiation.t index 07cf98e4..56102735 100644 --- a/t/with_worker-events/20-http-version-negotiation.t +++ b/t/with_worker-events/20-http-version-negotiation.t @@ -3,7 +3,7 @@ use Cwd qw(cwd); workers(1); -plan tests => repeat_each() * 38; +plan tests => repeat_each() * 41; my $pwd = cwd(); @@ -328,7 +328,7 @@ retrying with HTTP/ -=== TEST 6: 426 on HTTP/1.1, retried once then cached (server wants HTTP/2) +=== TEST 6: 426 on HTTP/1.1, no retry (downgrading to 1.0 won't help) --- http_config eval qq{ $::HttpConfig @@ -377,12 +377,12 @@ qq{ GET /t --- response_body true ---- error_log +--- no_error_log retrying with HTTP/ -=== TEST 7: non-standard server, returns 400 on HTTP/1.1, 200 on HTTP/1.0 +=== TEST 7: non-standard server returns 400 on HTTP/1.1, no retry (only 505/426 trigger retry) --- http_config eval qq{ $::HttpConfig @@ -427,19 +427,20 @@ qq{ ngx.sleep(2) local ok, err = checker:add_target("127.0.0.1", 2114, nil, false) ngx.sleep(0.6) - ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- true + -- 400 is not in healthy or unhealthy lists, no retry, status unchanged + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- false } } --- request GET /t --- response_body -true ---- error_log -returned 400 on HTTP/1.1, retrying with HTTP/1.0 +false +--- no_error_log +retrying with HTTP/ -=== TEST 8: genuinely unhealthy server (500 on both versions), retries once then caches +=== TEST 8: genuinely unhealthy server (500), no version retry --- http_config eval qq{ $::HttpConfig @@ -491,6 +492,8 @@ unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)' unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)' unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)' event: target status '127.0.0.1(127.0.0.1:2114)' from 'true' to 'false' +--- no_error_log +retrying with HTTP/ @@ -504,9 +507,10 @@ qq{ location = /status { content_by_lua_block { if ngx.var.server_protocol == "HTTP/1.1" then - return ngx.exit(418) + return ngx.exit(505) end - ngx.exit(500) + -- Retry with HTTP/1.0 also fails (418 is not in any list) + ngx.exit(418) } } } @@ -539,17 +543,18 @@ qq{ ngx.sleep(2) local ok, err = checker:add_target("127.0.0.1", 2114, nil, true) ngx.sleep(0.6) - -- 418 (original) is not in any list, so target stays healthy. - -- Without the fix, retry's 500 would be reported and cause - -- unhealthy increments. - ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- true + -- Retry gets 418 (not healthy), so original 505 is reported. + -- 505 is in the default unhealthy list, causing unhealthy increments. + -- If retry's 418 were incorrectly reported, no increments would happen. + ngx.say(checker:get_target_status("127.0.0.1", 2114)) -- false } } --- request GET /t --- response_body -true +false --- error_log -returned 418 on HTTP/1.1, retrying with HTTP/1.0 ---- no_error_log -unhealthy HTTP increment +returned 505 on HTTP/1.1, retrying with HTTP/1.0 +unhealthy HTTP increment (1/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (2/3) for '127.0.0.1(127.0.0.1:2114)' +unhealthy HTTP increment (3/3) for '127.0.0.1(127.0.0.1:2114)'