Skip to content

Commit 3701442

Browse files
committed
fix(*): avoid creating multiple timers to run the same active check (#156)
1 parent a22ac5a commit 3701442

File tree

5 files changed

+231
-12
lines changed

5 files changed

+231
-12
lines changed

lib/resty/healthcheck.lua

+48
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ local table_insert = table.insert
3434
local table_remove = table.remove
3535
local table_concat = table.concat
3636
local string_format = string.format
37+
local math_max = math.max
3738
local ssl = require("ngx.ssl")
3839
local resty_timer = require "resty.timer"
3940
local bit = require("bit")
@@ -1201,6 +1202,32 @@ local function renew_periodic_lock(shm, key)
12011202
end
12021203

12031204

1205+
local function get_callback_lock(shm, key, ttl)
1206+
local value = shm:get(key)
1207+
if value == nil then
1208+
-- no worker is checking, try to acquire the lock
1209+
local ok, err = shm:add(key, true, ttl or LOCK_PERIOD)
1210+
if not ok then
1211+
if err == "exists" then
1212+
-- another worker got the lock before
1213+
return false
1214+
end
1215+
1216+
return nil, err
1217+
end
1218+
1219+
return true
1220+
end
1221+
1222+
return false
1223+
end
1224+
1225+
1226+
local function remove_callback_lock(shm, key)
1227+
return shm:delete(key)
1228+
end
1229+
1230+
12041231
--- Active health check callback function.
12051232
-- @param self the checker object this timer runs on
12061233
-- @param health_mode either "healthy" or "unhealthy" to indicate what check
@@ -1209,10 +1236,27 @@ local function checker_callback(self, health_mode)
12091236
self.checker_callback_count = self.checker_callback_count + 1
12101237
end
12111238

1239+
-- Set a callback pending lock will exist for 2x the time of the active check.
1240+
-- An active check should be finished within this time and next timer will be
1241+
-- executed to exit a pending status.
1242+
local callback_pending_ttl = (math_max(self.checks.active.healthy.active and
1243+
self.checks.active.healthy.interval or 0,
1244+
self.checks.active.unhealthy.active and
1245+
self.checks.active.unhealthy.interval or 0)
1246+
+ self.checks.active.timeout) * 2
1247+
1248+
local callback_lock = self.CALLBACK_LOCK .. health_mode
1249+
-- a pending timer already exists, so skip this time
1250+
local ok, _ = get_callback_lock(self.shm, callback_lock, callback_pending_ttl)
1251+
if not ok then
1252+
return
1253+
end
1254+
12121255
local list_to_check = {}
12131256
local targets, err = fetch_target_list(self)
12141257
if not targets then
12151258
self:log(ERR, "checker_callback: ", err)
1259+
remove_callback_lock(self.shm, callback_lock)
12161260
return
12171261
end
12181262

@@ -1236,6 +1280,7 @@ local function checker_callback(self, health_mode)
12361280

12371281
if not list_to_check[1] then
12381282
self:log(DEBUG, "checking ", health_mode, " targets: nothing to do")
1283+
remove_callback_lock(self.shm, callback_lock)
12391284
else
12401285
local timer = resty_timer({
12411286
interval = 0,
@@ -1245,10 +1290,12 @@ local function checker_callback(self, health_mode)
12451290
expire = function()
12461291
self:log(DEBUG, "checking ", health_mode, " targets: #", #list_to_check)
12471292
self:active_check_targets(list_to_check)
1293+
remove_callback_lock(self.shm, callback_lock)
12481294
end,
12491295
})
12501296
if timer == nil then
12511297
self:log(ERR, "failed to create timer to check ", health_mode)
1298+
remove_callback_lock(self.shm, callback_lock)
12521299
end
12531300
end
12541301
end
@@ -1618,6 +1665,7 @@ function _M.new(opts)
16181665
self.TARGET_LIST_LOCK = SHM_PREFIX .. self.name .. ":target_list_lock"
16191666
self.TARGET_LOCK = SHM_PREFIX .. self.name .. ":target_lock"
16201667
self.PERIODIC_LOCK = SHM_PREFIX .. ":period_lock:"
1668+
self.CALLBACK_LOCK = SHM_PREFIX .. self.name .. ":callback_lock:"
16211669
-- prepare constants
16221670
self.EVENT_SOURCE = EVENT_SOURCE_PREFIX .. " [" .. self.name .. "]"
16231671
self.LOG_PREFIX = LOG_PREFIX .. "(" .. self.name .. ") "

t/with_resty-events/14-tls_active_probes.t

+9-6
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ __DATA__
4646
events_module = "resty.events",
4747
checks = {
4848
active = {
49+
timeout = 2,
4950
type = "https",
5051
http_path = "/",
5152
healthy = {
@@ -60,7 +61,7 @@ events_module = "resty.events",
6061
}
6162
})
6263
local ok, err = checker:add_target("104.154.89.105", 443, "badssl.com", false)
63-
ngx.sleep(8) -- wait for 4x the check interval
64+
ngx.sleep(16) -- wait for 4x the check interval
6465
ngx.say(checker:get_target_status("104.154.89.105", 443, "badssl.com")) -- true
6566
}
6667
}
@@ -69,7 +70,7 @@ GET /t
6970
--- response_body
7071
true
7172
--- timeout
72-
15
73+
20
7374

7475
=== TEST 2: active probes, invalid cert
7576
--- http_config eval: $::HttpConfig
@@ -87,6 +88,7 @@ true
8788
events_module = "resty.events",
8889
checks = {
8990
active = {
91+
timeout = 2,
9092
type = "https",
9193
http_path = "/",
9294
healthy = {
@@ -101,7 +103,7 @@ events_module = "resty.events",
101103
}
102104
})
103105
local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", true)
104-
ngx.sleep(8) -- wait for 4x the check interval
106+
ngx.sleep(16) -- wait for 4x the check interval
105107
ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- false
106108
}
107109
}
@@ -110,7 +112,7 @@ GET /t
110112
--- response_body
111113
false
112114
--- timeout
113-
15
115+
20
114116

115117
=== TEST 3: active probes, accept invalid cert when disabling check
116118
--- http_config eval: $::HttpConfig
@@ -128,6 +130,7 @@ false
128130
events_module = "resty.events",
129131
checks = {
130132
active = {
133+
timeout = 2,
131134
type = "https",
132135
https_verify_certificate = false,
133136
http_path = "/",
@@ -143,7 +146,7 @@ events_module = "resty.events",
143146
}
144147
})
145148
local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", false)
146-
ngx.sleep(8) -- wait for 4x the check interval
149+
ngx.sleep(16) -- wait for 4x the check interval
147150
ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- true
148151
}
149152
}
@@ -152,4 +155,4 @@ GET /t
152155
--- response_body
153156
true
154157
--- timeout
155-
15
158+
20

t/with_resty-events/19-timer.t

+93
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
use Test::Nginx::Socket::Lua;
2+
use Cwd qw(cwd);
3+
4+
workers(1);
5+
6+
plan tests => repeat_each() * blocks() * 2;
7+
8+
my $pwd = cwd();
9+
$ENV{TEST_NGINX_SERVROOT} = server_root();
10+
11+
our $HttpConfig = qq{
12+
lua_package_path "$pwd/lib/?.lua;;";
13+
lua_shared_dict test_shm 8m;
14+
15+
init_worker_by_lua_block {
16+
local we = require "resty.events.compat"
17+
assert(we.configure({
18+
unique_timeout = 5,
19+
broker_id = 0,
20+
listening = "unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock"
21+
}))
22+
assert(we.configured())
23+
}
24+
25+
server {
26+
server_name kong_worker_events;
27+
listen unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock;
28+
access_log off;
29+
location / {
30+
content_by_lua_block {
31+
require("resty.events.compat").run()
32+
}
33+
}
34+
}
35+
};
36+
37+
run_tests();
38+
39+
__DATA__
40+
41+
42+
43+
44+
=== TEST 1: active probes, http node failing
45+
--- http_config eval
46+
qq{
47+
$::HttpConfig
48+
49+
server {
50+
listen 2130;
51+
location = /status {
52+
content_by_lua_block {
53+
ngx.sleep(2)
54+
ngx.exit(500);
55+
}
56+
}
57+
}
58+
}
59+
--- config
60+
location = /t {
61+
content_by_lua_block {
62+
local healthcheck = require("resty.healthcheck")
63+
local checker = healthcheck.new({
64+
name = "testing",
65+
shm_name = "test_shm",
66+
events_module = "resty.events",
67+
type = "http",
68+
checks = {
69+
active = {
70+
timeout = 1,
71+
http_path = "/status",
72+
healthy = {
73+
interval = 0.1,
74+
successes = 3,
75+
},
76+
unhealthy = {
77+
interval = 0.1,
78+
http_failures = 3,
79+
}
80+
},
81+
}
82+
})
83+
local ok, err = checker:add_target("127.0.0.1", 2130, nil, true)
84+
ngx.sleep(3) -- wait for some time to let the checks run
85+
-- There should be no more than 3 timers running atm, but
86+
-- add a few spaces for worker events
87+
ngx.say(tonumber(ngx.timer.running_count()) <= 5)
88+
}
89+
}
90+
--- request
91+
GET /t
92+
--- response_body
93+
true

t/with_worker-events/14-tls_active_probes.t

+9-6
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ __DATA__
3434
shm_name = "test_shm",
3535
checks = {
3636
active = {
37+
timeout = 2,
3738
type = "https",
3839
http_path = "/",
3940
healthy = {
@@ -48,7 +49,7 @@ __DATA__
4849
}
4950
})
5051
local ok, err = checker:add_target("104.154.89.105", 443, "badssl.com", false)
51-
ngx.sleep(8) -- wait for 4x the check interval
52+
ngx.sleep(16) -- wait for 4x the check interval
5253
ngx.say(checker:get_target_status("104.154.89.105", 443, "badssl.com")) -- true
5354
}
5455
}
@@ -57,7 +58,7 @@ GET /t
5758
--- response_body
5859
true
5960
--- timeout
60-
15
61+
20
6162

6263
=== TEST 2: active probes, invalid cert
6364
--- http_config eval: $::HttpConfig
@@ -74,6 +75,7 @@ true
7475
shm_name = "test_shm",
7576
checks = {
7677
active = {
78+
timeout = 2,
7779
type = "https",
7880
http_path = "/",
7981
healthy = {
@@ -88,7 +90,7 @@ true
8890
}
8991
})
9092
local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", true)
91-
ngx.sleep(8) -- wait for 4x the check interval
93+
ngx.sleep(16) -- wait for 4x the check interval
9294
ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- false
9395
}
9496
}
@@ -97,7 +99,7 @@ GET /t
9799
--- response_body
98100
false
99101
--- timeout
100-
15
102+
20
101103

102104
=== TEST 3: active probes, accept invalid cert when disabling check
103105
--- http_config eval: $::HttpConfig
@@ -114,6 +116,7 @@ false
114116
shm_name = "test_shm",
115117
checks = {
116118
active = {
119+
timeout = 2,
117120
type = "https",
118121
https_verify_certificate = false,
119122
http_path = "/",
@@ -129,7 +132,7 @@ false
129132
}
130133
})
131134
local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", false)
132-
ngx.sleep(8) -- wait for 4x the check interval
135+
ngx.sleep(16) -- wait for 4x the check interval
133136
ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- true
134137
}
135138
}
@@ -138,4 +141,4 @@ GET /t
138141
--- response_body
139142
true
140143
--- timeout
141-
15
144+
20

0 commit comments

Comments
 (0)