Skip to content

Commit 60f7b48

Browse files
authored
fix: ensure one healthchecker runs at a time (#14)
1 parent d70756a commit 60f7b48

File tree

1 file changed

+29
-20
lines changed

1 file changed

+29
-20
lines changed

lib/resty/rediscluster.lua

+29-20
Original file line number | Diff line number | Diff line change
@@ -30,27 +30,27 @@ local DEFAULT_SEND_TIMEOUT = 1000
3030
local DEFAULT_READ_TIMEOUT = 1000
3131
local DEFAULT_HEALTH_DICT_NAME = "redis_cluster_health"
3232
local err_unhealthy_master = "master node is unhealthy"
33-
33+
local health_check_running = false
3434
--- Build the key that identifies one node of a named cluster.
-- @param name cluster name, used as the key prefix
-- @param ip node address
-- @param port node port
-- @return the three parts joined with ":" (name:ip:port)
local function generate_key(name, ip, port)
    local endpoint = ip .. ":" .. port
    return name .. ":" .. endpoint
end
3737

3838

39-
local function health_check_timer(premature)
39+
local function run_health_check(premature)
4040
if premature then
4141
return
4242
end
4343

44-
local health_dict = ngx.shared[DEFAULT_HEALTH_DICT_NAME]
45-
if not health_dict then
44+
local unhealthy_nodes_dict = ngx.shared[DEFAULT_HEALTH_DICT_NAME]
45+
if not unhealthy_nodes_dict then
4646
return
4747
end
4848

49-
local all_keys = health_dict:get_keys()
49+
local all_keys = unhealthy_nodes_dict:get_keys()
5050
for _, key in ipairs(all_keys) do
5151
local ip, port = string.match(key, "^[^:]+:([^:]+):(%d+)$")
5252
if not ip or not port then
53-
health_dict:delete(key)
53+
unhealthy_nodes_dict:delete(key)
5454
goto continue
5555
end
5656
port = tonumber(port)
@@ -72,25 +72,26 @@ local function health_check_timer(premature)
7272
end
7373
-- Update health status based on check
7474
if ok then
75-
health_dict:delete(key)
75+
unhealthy_nodes_dict:delete(key)
7676
ngx.log(ngx.WARN, "health check success for: ", ip, ":", port)
7777
else
78-
local failures = health_dict:get(key) or 0
79-
health_dict:set(key, failures + 1, 60) -- Unhealthy: increment failures with TTL
80-
ngx.log(ngx.WARN, "health check failed for: ", ip, ":", port, "failures: ", failures + 1)
78+
local failures = unhealthy_nodes_dict:get(key) or 0
79+
unhealthy_nodes_dict:set(key, failures + 1, 60) -- Unhealthy: increment failures with TTL
80+
ngx.log(ngx.ERR, "health check failed for: ", ip, ":", port, " - failures: ", failures + 1)
8181
end
8282

8383
::continue::
8484
end
8585
end
8686

87+
8788
--- Record one failure for a node in the shared unhealthy-nodes dict.
-- The counter is initialised to 0 with a 60 second TTL when the key is
-- absent, then incremented by 1. Does nothing when the shared dict is not
-- configured.
-- @param ip node address
-- @param port node port
-- @param name cluster name, used as the key prefix
local function track_node_failure(ip, port, name)
    local unhealthy_nodes_dict = ngx.shared[DEFAULT_HEALTH_DICT_NAME]
    if not unhealthy_nodes_dict then
        return
    end

    local key = generate_key(name, ip, port)
    local newval, err = unhealthy_nodes_dict:incr(key, 1, 0, 60)
    if not newval then
        -- incr can fail (for example when the dict is out of memory or the
        -- slot holds a non-number); report it instead of silently losing
        -- the failure count.
        ngx.log(ngx.WARN, "failed to record node failure for: ", key, " - ", err)
    end
end
9596

9697
local function parse_key(key_str)
@@ -113,15 +114,23 @@ local slot_cache = {}
113114
local master_nodes = {}
114115

115116

117+
--- Timer callback that lets at most one health check run at a time.
-- The guard is the module-level local `health_check_running`, so it applies
-- to this Lua VM only (presumably one per nginx worker -- confirm).
-- @param premature forwarded unchanged to run_health_check
local function health_check(premature)
    -- A previous tick is still checking nodes: skip this tick rather than
    -- start an overlapping run.
    if health_check_running then
        return
    end

    health_check_running = true
    -- pcall ensures the flag is cleared even when the check throws; without
    -- it a single error would block every later tick.
    local ok, err = pcall(run_health_check, premature)
    health_check_running = false

    if not ok then
        -- Surface the failure; otherwise a broken checker fails silently
        -- on every tick.
        ngx.log(ngx.ERR, "health check aborted: ", err)
    end
end
116125

117126
--- Report whether a node is still considered usable.
-- A node is healthy while its recorded failure count is at most 3; a key
-- that is absent from the dict counts as 0 failures. When the shared dict
-- is not configured, every node is reported healthy.
-- @param ip node address
-- @param port node port
-- @param name cluster name, used as the key prefix
-- @return true when the node is healthy, false otherwise
local function is_node_healthy(ip, port, name)
    local dict = ngx.shared[DEFAULT_HEALTH_DICT_NAME]
    if dict then
        local failures = dict:get(generate_key(name, ip, port)) or 0
        return failures <= 3
    end

    return true
end
127136

@@ -342,18 +351,18 @@ function _M.refresh_slots(self)
342351

343352
self:fetch_slots()
344353
-- Cleanup health dict entries for removed nodes
345-
local health_dict = ngx.shared[DEFAULT_HEALTH_DICT_NAME]
354+
local unhealthy_nodes_dict = ngx.shared[DEFAULT_HEALTH_DICT_NAME]
346355
local current_nodes = {}
347356
local servers = slot_cache[self.config.name .. "serv_list"].serv_list
348357
for _, node in ipairs(servers) do
349358
local key = generate_key(self.config.name, node.ip, node.port)
350359
current_nodes[key] = true
351360
end
352361
-- Cleanup stale nodes
353-
local all_keys = health_dict:get_keys()
362+
local all_keys = unhealthy_nodes_dict:get_keys()
354363
for _, key in ipairs(all_keys) do
355364
if not current_nodes[key] then
356-
health_dict:delete(key)
365+
unhealthy_nodes_dict:delete(key)
357366
end
358367
end
359368

@@ -920,7 +929,7 @@ setmetatable(_M, {
920929
})
921930

922931
--- Start the recurring health check timer, firing once per second.
-- @return true on success; nil plus an error message when the timer could
--         not be created
function _M.init()
    local hdl, err = ngx.timer.every(1, health_check)
    if not hdl then
        -- Without this, a failed timer creation would silently disable
        -- health checking.
        ngx.log(ngx.ERR, "failed to start health check timer: ", err)
        return nil, err
    end

    return true
end
925934

926935
return _M

0 commit comments

Comments
 (0)