From cdabfc3238750d0fd66422ca4918677da13d7db1 Mon Sep 17 00:00:00 2001 From: windmgc Date: Thu, 9 May 2024 16:18:31 +0800 Subject: [PATCH 01/14] fix(*): avoid creating multiple pending timers to run active check --- lib/resty/healthcheck.lua | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index 4d05c9c..26fd512 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -33,6 +33,7 @@ local ipairs = ipairs local table_insert = table.insert local table_remove = table.remove local string_format = string.format +local math_max = math.max local ssl = require("ngx.ssl") local resty_timer = require "resty.timer" local bit = require("bit") @@ -1165,13 +1166,13 @@ end -- @return `true` on success, or false if the lock was not acquired, or `nil + error` -- in case of errors -local function get_periodic_lock(shm, key) +local function get_periodic_lock(shm, key, ttl) local my_pid = ngx_worker_pid() local checker_pid = shm:get(key) if checker_pid == nil then -- no worker is checking, try to acquire the lock - local ok, err = shm:add(key, my_pid, LOCK_PERIOD) + local ok, err = shm:add(key, my_pid, ttl or LOCK_PERIOD) if not ok then if err == "exists" then -- another worker got the lock before @@ -1200,6 +1201,11 @@ local function renew_periodic_lock(shm, key) end +local function remove_periodic_lock(shm, key) + return shm:delete(key) +end + + --- Active health check callback function. -- @param self the checker object this timer runs on -- @param health_mode either "healthy" or "unhealthy" to indicate what check @@ -1208,6 +1214,24 @@ local function checker_callback(self, health_mode) self.checker_callback_count = self.checker_callback_count + 1 end + -- Set a callback pending lock will exist for 2x the time of the active check. + -- An active check should be finished within this time and next timer will be + -- executed to exit a pending status. + -- Note that this does not influence the actual check interval, but only + -- limiting the number of scheduling another same check when the previous one + -- is still pending. + local callback_pending_ttl = (math_max(self.checks.active.healthy.active and + self.checks.active.healthy.interval or 0, + self.checks.active.unhealthy.active and + self.checks.active.unhealthy.interval or 0) + + self.checks.active.timeout) * 2 + + -- a pending timer already exists, so skip this time + local ok, _ = get_periodic_lock(self.shm, self.CALLBACK_LOCK, callback_pending_ttl) + if not ok then + return + end + local list_to_check = {} local targets, err = fetch_target_list(self) if not targets then @@ -1242,6 +1266,7 @@ local function checker_callback(self, health_mode) immediate = false, detached = true, expire = function() + remove_periodic_lock(self.shm, self.CALLBACK_LOCK) self:log(DEBUG, "checking ", health_mode, " targets: #", #list_to_check) self:active_check_targets(list_to_check) end, @@ -1614,6 +1639,7 @@ function _M.new(opts) self.TARGET_LIST_LOCK = SHM_PREFIX .. self.name .. ":target_list_lock" self.TARGET_LOCK = SHM_PREFIX .. self.name .. ":target_lock" self.PERIODIC_LOCK = SHM_PREFIX .. ":period_lock:" + self.CALLBACK_LOCK = SHM_PREFIX .. self.name .. ":callback_lock" -- prepare constants self.EVENT_SOURCE = EVENT_SOURCE_PREFIX .. " [" .. self.name .. "]" self.LOG_PREFIX = LOG_PREFIX .. "(" .. self.name .. ") " From 7568c3f82f89913e60e933c5c5ac75b649960a07 Mon Sep 17 00:00:00 2001 From: windmgc Date: Fri, 10 May 2024 13:37:09 +0800 Subject: [PATCH 02/14] fix(*): callback lock will be released after finish active hc --- lib/resty/healthcheck.lua | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index 26fd512..72b0e84 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -1201,7 +1201,28 @@ local function renew_periodic_lock(shm, key) end -local function remove_periodic_lock(shm, key) +local function get_callback_lock(shm, key, ttl) + local value = shm:get(key) + if value == nil then + -- no worker is checking, try to acquire the lock + local ok, err = shm:add(key, true, ttl or LOCK_PERIOD) + if not ok then + if err == "exists" then + -- another worker got the lock before + return false + end + + return nil, err + end + + return true + end + + return false +end + + +local function remove_callback_lock(shm, key) return shm:delete(key) end @@ -1217,9 +1238,6 @@ local function checker_callback(self, health_mode) -- Set a callback pending lock will exist for 2x the time of the active check. -- An active check should be finished within this time and next timer will be -- executed to exit a pending status. - -- Note that this does not influence the actual check interval, but only - -- limiting the number of scheduling another same check when the previous one - -- is still pending. local callback_pending_ttl = (math_max(self.checks.active.healthy.active and self.checks.active.healthy.interval or 0, self.checks.active.unhealthy.active and @@ -1227,7 +1245,7 @@ local function checker_callback(self, health_mode) + self.checks.active.timeout) * 2 -- a pending timer already exists, so skip this time - local ok, _ = get_periodic_lock(self.shm, self.CALLBACK_LOCK, callback_pending_ttl) + local ok, _ = get_callback_lock(self.shm, self.CALLBACK_LOCK, callback_pending_ttl) if not ok then return end @@ -1266,9 +1284,9 @@ local function checker_callback(self, health_mode) immediate = false, detached = true, expire = function() - remove_periodic_lock(self.shm, self.CALLBACK_LOCK) self:log(DEBUG, "checking ", health_mode, " targets: #", #list_to_check) self:active_check_targets(list_to_check) + remove_callback_lock(self.shm, self.CALLBACK_LOCK) end, }) if timer == nil then From fead0052815a5880f5e322baf91f80856785e077 Mon Sep 17 00:00:00 2001 From: windmgc Date: Fri, 10 May 2024 13:40:09 +0800 Subject: [PATCH 03/14] trigger ci From d42f9922c39bbff396cb2f578a1776331a8f9f4d Mon Sep 17 00:00:00 2001 From: windmgc Date: Fri, 10 May 2024 14:41:13 +0800 Subject: [PATCH 04/14] fix(*): lock callback by health mode --- lib/resty/healthcheck.lua | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index 72b0e84..e75ea94 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -1244,8 +1244,9 @@ local function checker_callback(self, health_mode) self.checks.active.unhealthy.interval or 0) + self.checks.active.timeout) * 2 + local callback_lock = self.CALLBACK_LOCK .. health_mode -- a pending timer already exists, so skip this time - local ok, _ = get_callback_lock(self.shm, self.CALLBACK_LOCK, callback_pending_ttl) + local ok, _ = get_callback_lock(self.shm, callback_lock, callback_pending_ttl) if not ok then return end @@ -1286,7 +1287,7 @@ local function checker_callback(self, health_mode) expire = function() self:log(DEBUG, "checking ", health_mode, " targets: #", #list_to_check) self:active_check_targets(list_to_check) - remove_callback_lock(self.shm, self.CALLBACK_LOCK) + remove_callback_lock(self.shm, callback_lock) end, }) if timer == nil then @@ -1657,7 +1658,7 @@ function _M.new(opts) self.TARGET_LIST_LOCK = SHM_PREFIX .. self.name .. ":target_list_lock" self.TARGET_LOCK = SHM_PREFIX .. self.name .. ":target_lock" self.PERIODIC_LOCK = SHM_PREFIX .. ":period_lock:" - self.CALLBACK_LOCK = SHM_PREFIX .. self.name .. ":callback_lock" + self.CALLBACK_LOCK = SHM_PREFIX .. self.name .. ":callback_lock:" -- prepare constants self.EVENT_SOURCE = EVENT_SOURCE_PREFIX .. " [" .. self.name .. "]" self.LOG_PREFIX = LOG_PREFIX .. "(" .. self.name .. ") " From 53503e8ba47b49c8e5f54e6b4418c1655efa1841 Mon Sep 17 00:00:00 2001 From: windmgc Date: Fri, 10 May 2024 14:48:29 +0800 Subject: [PATCH 05/14] fix(*): remove callback lock --- lib/resty/healthcheck.lua | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index e75ea94..2726794 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -1255,6 +1255,7 @@ local function checker_callback(self, health_mode) local targets, err = fetch_target_list(self) if not targets then self:log(ERR, "checker_callback: ", err) + remove_callback_lock(self.shm, callback_lock) return end @@ -1278,6 +1279,7 @@ local function checker_callback(self, health_mode) if not list_to_check[1] then self:log(DEBUG, "checking ", health_mode, " targets: nothing to do") + remove_callback_lock(self.shm, callback_lock) else local timer = resty_timer({ interval = 0, @@ -1292,6 +1294,7 @@ local function checker_callback(self, health_mode) }) if timer == nil then self:log(ERR, "failed to create timer to check ", health_mode) + remove_callback_lock(self.shm, callback_lock) end end end From cf8b13c0744100326d9b992d87f69e30abfe236b Mon Sep 17 00:00:00 2001 From: windmgc Date: Fri, 10 May 2024 15:38:26 +0800 Subject: [PATCH 06/14] fix(*): cleanup unnecessary change --- lib/resty/healthcheck.lua | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/resty/healthcheck.lua b/lib/resty/healthcheck.lua index 2726794..2477abb 100644 --- a/lib/resty/healthcheck.lua +++ b/lib/resty/healthcheck.lua @@ -1166,13 +1166,13 @@ end -- @return `true` on success, or false if the lock was not acquired, or `nil + error` -- in case of errors -local function get_periodic_lock(shm, key, ttl) +local function get_periodic_lock(shm, key) local my_pid = ngx_worker_pid() local checker_pid = shm:get(key) if checker_pid == nil then -- no worker is checking, try to acquire the lock - local ok, err = shm:add(key, my_pid, ttl or LOCK_PERIOD) + local ok, err = shm:add(key, my_pid, LOCK_PERIOD) if not ok then if err == "exists" then -- another worker got the lock before From 3f737f0ed4ddcfb6fabed8c9d5679f3c8648605f Mon Sep 17 00:00:00 2001 From: windmgc Date: Fri, 10 May 2024 15:38:44 +0800 Subject: [PATCH 07/14] tests(*): add timeout for badssl due to long latency --- t/with_worker-events/14-tls_active_probes.t | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/t/with_worker-events/14-tls_active_probes.t b/t/with_worker-events/14-tls_active_probes.t index 5185492..1b0203e 100644 --- a/t/with_worker-events/14-tls_active_probes.t +++ b/t/with_worker-events/14-tls_active_probes.t @@ -34,6 +34,7 @@ __DATA__ shm_name = "test_shm", checks = { active = { + timeout = 2, type = "https", http_path = "/", healthy = { @@ -48,7 +49,7 @@ __DATA__ } }) local ok, err = checker:add_target("104.154.89.105", 443, "badssl.com", false) - ngx.sleep(8) -- wait for 4x the check interval + ngx.sleep(16) -- wait for 4x the check interval ngx.say(checker:get_target_status("104.154.89.105", 443, "badssl.com")) -- true } } @@ -74,6 +75,7 @@ true shm_name = "test_shm", checks = { active = { + timeout = 2, type = "https", http_path = "/", healthy = { @@ -88,7 +90,7 @@ true } }) local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", true) - ngx.sleep(8) -- wait for 4x the check interval + ngx.sleep(16) -- wait for 4x the check interval ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- false } } @@ -114,6 +116,7 @@ false shm_name = "test_shm", checks = { active = { + timeout = 2, type = "https", https_verify_certificate = false, http_path = "/", @@ -129,7 +132,7 @@ false } }) local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", false) - ngx.sleep(8) -- wait for 4x the check interval + ngx.sleep(16) -- wait for 4x the check interval ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- true } } From ae1ea0095d11811a66f5714a81c7371f4cdd743d Mon Sep 17 00:00:00 2001 From: windmgc Date: Fri, 10 May 2024 15:54:10 +0800 Subject: [PATCH 08/14] tests(*): raise test timeout --- t/with_resty-events/14-tls_active_probes.t | 15 +++++++++------ t/with_worker-events/14-tls_active_probes.t | 6 +++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/t/with_resty-events/14-tls_active_probes.t b/t/with_resty-events/14-tls_active_probes.t index d9e4490..f4c175e 100644 --- a/t/with_resty-events/14-tls_active_probes.t +++ b/t/with_resty-events/14-tls_active_probes.t @@ -46,6 +46,7 @@ __DATA__ events_module = "resty.events", checks = { active = { + timeout = 2, type = "https", http_path = "/", healthy = { @@ -60,7 +61,7 @@ events_module = "resty.events", } }) local ok, err = checker:add_target("104.154.89.105", 443, "badssl.com", false) - ngx.sleep(8) -- wait for 4x the check interval + ngx.sleep(16) -- wait for 4x the check interval ngx.say(checker:get_target_status("104.154.89.105", 443, "badssl.com")) -- true } } @@ -69,7 +70,7 @@ GET /t --- response_body true --- timeout -15 +20 === TEST 2: active probes, invalid cert --- http_config eval: $::HttpConfig @@ -87,6 +88,7 @@ true events_module = "resty.events", checks = { active = { + timeout = 2, type = "https", http_path = "/", healthy = { @@ -101,7 +103,7 @@ events_module = "resty.events", } }) local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", true) - ngx.sleep(8) -- wait for 4x the check interval + ngx.sleep(16) -- wait for 4x the check interval ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- false } } @@ -110,7 +112,7 @@ GET /t --- response_body false --- timeout -15 +20 === TEST 3: active probes, accept invalid cert when disabling check --- http_config eval: $::HttpConfig @@ -128,6 +130,7 @@ false events_module = "resty.events", checks = { active = { + timeout = 2, type = "https", https_verify_certificate = false, http_path = "/", @@ -143,7 +146,7 @@ events_module = "resty.events", } }) local ok, err = checker:add_target("104.154.89.105", 443, "wrong.host.badssl.com", false) - ngx.sleep(8) -- wait for 4x the check interval + ngx.sleep(16) -- wait for 4x the check interval ngx.say(checker:get_target_status("104.154.89.105", 443, "wrong.host.badssl.com")) -- true } } @@ -152,4 +155,4 @@ GET /t --- response_body true --- timeout -15 +20 diff --git a/t/with_worker-events/14-tls_active_probes.t b/t/with_worker-events/14-tls_active_probes.t index 1b0203e..a9d854b 100644 --- a/t/with_worker-events/14-tls_active_probes.t +++ b/t/with_worker-events/14-tls_active_probes.t @@ -58,7 +58,7 @@ GET /t --- response_body true --- timeout -15 +20 === TEST 2: active probes, invalid cert --- http_config eval: $::HttpConfig @@ -99,7 +99,7 @@ GET /t --- response_body false --- timeout -15 +20 === TEST 3: active probes, accept invalid cert when disabling check --- http_config eval: $::HttpConfig @@ -141,4 +141,4 @@ GET /t --- response_body true --- timeout -15 +20 From 7319ffd3e20a574443056bef4852b28a8a4142a3 Mon Sep 17 00:00:00 2001 From: windmgc Date: Fri, 10 May 2024 15:56:08 +0800 Subject: [PATCH 09/14] trigger ci From 29f3ec5e2b55080fafc6865902c531102f13b62c Mon Sep 17 00:00:00 2001 From: windmgc Date: Fri, 10 May 2024 17:58:12 +0800 Subject: [PATCH 10/14] tests(*): add number of timers check test --- t/with_resty-events/19-timer.t | 92 +++++++++++++++++++++++++++++++++ t/with_worker-events/19-timer.t | 72 ++++++++++++++++++++++++++ 2 files changed, 164 insertions(+) create mode 100644 t/with_resty-events/19-timer.t create mode 100644 t/with_worker-events/19-timer.t diff --git a/t/with_resty-events/19-timer.t b/t/with_resty-events/19-timer.t new file mode 100644 index 0000000..c3f181b --- /dev/null +++ b/t/with_resty-events/19-timer.t @@ -0,0 +1,92 @@ +use Test::Nginx::Socket::Lua; +use Cwd qw(cwd); + +workers(1); + +plan tests => repeat_each() * blocks() * 2; + +my $pwd = cwd(); +$ENV{TEST_NGINX_SERVROOT} = server_root(); + +our $HttpConfig = qq{ + lua_package_path "$pwd/lib/?.lua;;"; + lua_shared_dict test_shm 8m; + + init_worker_by_lua_block { + local we = require "resty.events.compat" + assert(we.configure({ + unique_timeout = 5, + broker_id = 0, + listening = "unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock" + })) + assert(we.configured()) + } + + server { + server_name kong_worker_events; + listen unix:$ENV{TEST_NGINX_SERVROOT}/worker_events.sock; + access_log off; + location / { + content_by_lua_block { + require("resty.events.compat").run() + } + } + } +}; + +run_tests(); + +__DATA__ + + + + +=== TEST 1: active probes, http node failing +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2130; + location = /status { + content_by_lua_block { + ngx.sleep(2) + ngx.exit(500); + } + } + } +} +--- config + location = /t { + content_by_lua_block { + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + type = "http", + checks = { + active = { + timeout = 1, + http_path = "/status", + healthy = { + interval = 0.1, + successes = 3, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + local ok, err = checker:add_target("127.0.0.1", 2130, nil, true) + ngx.sleep(4) -- wait for some time to let the checks run + -- There should be no more than 3 timers running atm, but + -- add a few spaces for worker events + ngx.say(tonumber(ngx.timer.running_count()) <= 5) + } + } +--- request +GET /t +--- response_body +true diff --git a/t/with_worker-events/19-timer.t b/t/with_worker-events/19-timer.t new file mode 100644 index 0000000..cd67617 --- /dev/null +++ b/t/with_worker-events/19-timer.t @@ -0,0 +1,72 @@ +use Test::Nginx::Socket::Lua; +use Cwd qw(cwd); + +workers(1); + +plan tests => repeat_each() * blocks() * 2; + +my $pwd = cwd(); + +our $HttpConfig = qq{ + lua_package_path "$pwd/lib/?.lua;;"; + lua_shared_dict test_shm 8m; + lua_shared_dict my_worker_events 8m; +}; + +run_tests(); + +__DATA__ + + + +=== TEST 1: active probes, http node failing +--- http_config eval +qq{ + $::HttpConfig + + server { + listen 2130; + location = /status { + content_by_lua_block { + ngx.sleep(2) + ngx.exit(500); + } + } + } +} +--- config + location = /t { + content_by_lua_block { + local we = require "resty.worker.events" + assert(we.configure{ shm = "my_worker_events", interval = 0.1 }) + local healthcheck = require("resty.healthcheck") + local checker = healthcheck.new({ + name = "testing", + shm_name = "test_shm", + type = "http", + checks = { + active = { + timeout = 1, + http_path = "/status", + healthy = { + interval = 0.1, + successes = 3, + }, + unhealthy = { + interval = 0.1, + http_failures = 3, + } + }, + } + }) + local ok, err = checker:add_target("127.0.0.1", 2130, nil, true) + ngx.sleep(4) -- wait for some time to let the checks run + -- There should be no more than 3 timers running atm, but + -- add a few spaces for worker events + ngx.say(tonumber(ngx.timer.running_count()) <= 5) + } + } +--- request +GET /t +--- response_body +true From c70ba41a7e6f207123ba038834dc5f0efb0e80ba Mon Sep 17 00:00:00 2001 From: windmgc Date: Fri, 10 May 2024 18:01:53 +0800 Subject: [PATCH 11/14] trigger test From 5996cb78824e07f49b906e4ebc4d5ccabacf52c6 Mon Sep 17 00:00:00 2001 From: windmgc Date: Fri, 10 May 2024 20:05:52 +0800 Subject: [PATCH 12/14] tests(*): fix test for resty events --- t/with_resty-events/19-timer.t | 1 + 1 file changed, 1 insertion(+) diff --git a/t/with_resty-events/19-timer.t b/t/with_resty-events/19-timer.t index c3f181b..103aa17 100644 --- a/t/with_resty-events/19-timer.t +++ b/t/with_resty-events/19-timer.t @@ -63,6 +63,7 @@ qq{ local checker = healthcheck.new({ name = "testing", shm_name = "test_shm", + events_module = "resty.events", type = "http", checks = { active = { From b97177d108c8dc2e7c3f127031e9e925ec0189f8 Mon Sep 17 00:00:00 2001 From: windmgc Date: Fri, 10 May 2024 20:27:14 +0800 Subject: [PATCH 13/14] tests(*): reduce sleep time a bit to avoid timeout --- t/with_resty-events/19-timer.t | 2 +- t/with_worker-events/19-timer.t | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/t/with_resty-events/19-timer.t b/t/with_resty-events/19-timer.t index 103aa17..fa2d510 100644 --- a/t/with_resty-events/19-timer.t +++ b/t/with_resty-events/19-timer.t @@ -81,7 +81,7 @@ qq{ } }) local ok, err = checker:add_target("127.0.0.1", 2130, nil, true) - ngx.sleep(4) -- wait for some time to let the checks run + ngx.sleep(3) -- wait for some time to let the checks run -- There should be no more than 3 timers running atm, but -- add a few spaces for worker events ngx.say(tonumber(ngx.timer.running_count()) <= 5) diff --git a/t/with_worker-events/19-timer.t b/t/with_worker-events/19-timer.t index cd67617..b871b47 100644 --- a/t/with_worker-events/19-timer.t +++ b/t/with_worker-events/19-timer.t @@ -60,7 +60,7 @@ qq{ } }) local ok, err = checker:add_target("127.0.0.1", 2130, nil, true) - ngx.sleep(4) -- wait for some time to let the checks run + ngx.sleep(3) -- wait for some time to let the checks run -- There should be no more than 3 timers running atm, but -- add a few spaces for worker events ngx.say(tonumber(ngx.timer.running_count()) <= 5) From 9cedd47cc5062ba03f7e8f5d19a6653c571c4941 Mon Sep 17 00:00:00 2001 From: windmgc Date: Fri, 10 May 2024 20:29:03 +0800 Subject: [PATCH 14/14] trigger test