Skip to content

Commit

Permalink
fix(lock) handle all resty.lock failure modes (#112)
Browse files Browse the repository at this point in the history
* fix(lock) handle all resty.lock failure modes

This fixes a bug in the `run_fn_locked_target_list` function. The
function wraps `resty.lock:lock()` with pcall, but it was only checking
the pcall return status and not the actual result of the lock operation.
Therefore it would continue even if no lock was acquired.

Additionally, I expanded the error-handling to explicitly check for
known errors returned by `resty.lock:lock()` ("timeout" and "locked").
This ensures that we only retry the function if the lock operation
failed in a recoverable way.

* refactor(lock) use special run_locked helper

* fix(healthchecker) handle fetch_target_list failure in checker callback

* refactor(lock) don't return function value in timer

* fix(lock) change run_locked return signature when rescheduling

* fix(lock) fix return check conditional

* chore(*) revert a docstring change

* chore(*) re-organize dependencies and cached locals

* chore(lock) add explanation for resty_lock's limited scope
  • Loading branch information
flrgh authored Jul 6, 2022
1 parent 50573b3 commit c0950b9
Showing 1 changed file with 162 additions and 104 deletions.
266 changes: 162 additions & 104 deletions lib/resty/healthcheck.lua
Original file line number Diff line number Diff line change
Expand Up @@ -24,30 +24,36 @@
-- @author Hisham Muhammad, Thijs Schreijer
-- @license Apache 2.0

local bit = require("bit")
local cjson = require("cjson.safe").new()
local resty_timer = require("resty.timer")
local ssl = require("ngx.ssl")
local worker_events = require("resty.worker.events")
-- local resty_lock = require("resty.lock") -- required later in the file"

local ERR = ngx.ERR
local WARN = ngx.WARN
local DEBUG = ngx.DEBUG
local ngx_log = ngx.log
local re_find = ngx.re.find
local ngx_worker_exiting = ngx.worker.exiting
local get_phase = ngx.get_phase

local tostring = tostring
local ipairs = ipairs
local cjson = require("cjson.safe").new()
local pcall = pcall
local type = type
local assert = assert

local table_remove = table.remove
local table_concat = table.concat
local string_format = string.format
local resty_timer = require("resty.timer")
local worker_events = require("resty.worker.events")
local resty_lock = require ("resty.lock")
local re_find = ngx.re.find
local bit = require("bit")
local ngx_worker_exiting = ngx.worker.exiting
local ssl = require("ngx.ssl")

local new_tab
local nkeys
local is_array

do
local pcall = pcall
local ok

ok, new_tab = pcall(require, "table.new")
Expand Down Expand Up @@ -202,7 +208,7 @@ local ngx_timer_at do
end

ngx_timer_at = function(...)
local phase = ngx.get_phase()
local phase = get_phase()
if phase ~= "init" and phase ~= "init_worker" then
-- we're past init/init_worker, so replace this temp function with the
-- real-deal again, so from here on we run regular timers.
Expand All @@ -222,6 +228,128 @@ local ngx_timer_at do
end


local run_locked
do
-- resty_lock is restricted to this scope in order to keep sensitive
-- lock-handling code separate separate from all other business logic
--
-- If you need to use resty_lock in a way that is not covered by the
-- `run_locked` helper function defined below, it's strongly-advised to
-- define it fully within this scope unless you have a very good reason
--
-- (see https://github.com/Kong/lua-resty-healthcheck/pull/112)
local resty_lock = require "resty.lock"

local yieldable = {
rewrite = true,
access = true,
content = true,
timer = true,
}

local function run_in_timer(premature, fn, ...)
if not premature then
fn(...)
end
end

local function schedule(fn, ...)
return ngx_timer_at(0, run_in_timer, fn, ...)
end

-- timeout when yieldable
local timeout = 5

-- resty.lock consumes these options immediately, so this table can be reused
local opts = {
exptime = 10, -- timeout after which lock is released anyway
timeout = timeout, -- max wait time to acquire lock
}

---
-- Acquire a lock and run a function
--
-- The function call itself is wrapped with `pcall` to protect against
-- exception.
--
-- This function exhibits some special behavior when called during a
-- non-yieldable phase such as `init_worker` or `log`:
--
-- 1. The lock timeout is set to 0 to ensure that `resty.lock` does not
-- attempt to sleep/yield
-- 2. If acquiring the lock fails due to a timeout, `run_locked`
-- (this function) is re-scheduled to run in a timer. In this case,
-- the function returns `"scheduled"` instead of the return value of
-- the locked function
--
-- @param self The checker object
-- @param key the key/identifier to acquire a lock for
-- @param fn The function to execute
-- @param ... arguments that will be passed to fn
-- @return The results of the function; or nil and an error message
-- in case it fails locking.
function run_locked(self, key, fn, ...)
-- we're extra extra extra defensive in this code path
local typ = type(key)
-- XXX is a number key ever expected?
assert(typ == "string" or typ == "number",
"unexpected lock key type: " .. typ)
key = tostring(key)

-- first aqcuire a lock or conditionally re-schedule ourselves in a timer
local lock
do
local yield = yieldable[get_phase()]

if yield then
opts.timeout = timeout
else
-- if yielding is not possible in the current phase, use a zero timeout
-- so that resty.lock will return `nil, "timeout"` immediately instead of
-- calling ngx.sleep()
opts.timeout = 0
end

local err
lock, err = resty_lock:new(self.shm_name, opts)
if not lock then
return nil, "failed creating lock for '" .. key .. "', " .. err
end

local elapsed
elapsed, err = lock:lock(key)

if not elapsed and err == "timeout" and not yield then
-- yielding is not possible in the current phase, so retry in a timer
local ok, terr = schedule(run_locked, self, key, fn, ...)
if not ok then
return nil, terr
end

return "scheduled"

elseif not elapsed then
return nil, "failed acquiring lock for '" .. key .. "', " .. err
end
end

local pok, perr, res = pcall(fn, ...)

local ok, err = lock:unlock()
if not ok then
self:log(ERR, "failed unlocking '", key, "', ", err)
end

if not pok then
return nil, perr
else
return perr, res
end
end
end



local _M = {}


Expand Down Expand Up @@ -264,66 +392,28 @@ local function fetch_target_list(self)
end


--- Helper function to run the function holding a lock on the target list.
-- @see locking_target_list
local function run_fn_locked_target_list(premature, self, fn)

if premature then
return
end

local tl_lock, lock_err = resty_lock:new(self.shm_name, {
exptime = 10, -- timeout after which lock is released anyway
timeout = 5, -- max wait time to acquire lock
})

if not tl_lock then
return nil, "failed to create lock:" .. lock_err
end

local pok, perr = pcall(tl_lock.lock, tl_lock, self.TARGET_LIST_LOCK)
if not pok then
self:log(DEBUG, "failed to acquire lock: ", perr)
return nil, "failed to acquire lock"
end

local target_list, err = fetch_target_list(self)

local final_ok, final_err

if target_list then
final_ok, final_err = pcall(fn, target_list)
else
final_ok, final_err = nil, err
end

local ok
ok, err = tl_lock:unlock()
if not ok then
-- recoverable: not returning this error, only logging it
self:log(ERR, "failed to release lock '", self.TARGET_LIST_LOCK,
"': ", err)
local function with_target_list(self, fn)
local targets, err = fetch_target_list(self)
if not targets then
return nil, err
end

return final_ok, final_err
-- this is only ever called in the context of `run_locked`,
-- so no pcall needed
return fn(targets)
end


--- Run the given function holding a lock on the target list.
-- @param self The checker object
-- @param fn The function to execute
-- @return The results of the function; or nil and an error message
-- in case it fails locking.
-- @return The results of the function; "scheduled" if the function was
-- scheduled in a timer, or nil and an error message in case of failure
local function locking_target_list(self, fn)
local ok, err = run_locked(self, self.TARGET_LIST_LOCK, with_target_list, self, fn)

local ok, err = run_fn_locked_target_list(false, self, fn)
if err == "failed to acquire lock" then
local _, terr = ngx_timer_at(0, run_fn_locked_target_list, self, fn)
if terr ~= nil then
return nil, terr
end

return true
if ok == "scheduled" then
self:log(DEBUG, "target_list function re-scheduled in timer")
end

return ok, err
Expand Down Expand Up @@ -533,58 +623,21 @@ end
------------------------------------------------------------------------------


--- Helper function to actually run the function holding a lock on the target.
-- @see locking_target
local function run_mutexed_fn(premature, self, ip, port, hostname, fn)
if premature then
return
end

local tlock, lock_err = resty_lock:new(self.shm_name, {
exptime = 10, -- timeout after which lock is released anyway
timeout = 5, -- max wait time to acquire lock
})
if not tlock then
return nil, "failed to create lock:" .. lock_err
end
local lock_key = key_for(self.TARGET_LOCK, ip, port, hostname)

local pok, perr = pcall(tlock.lock, tlock, lock_key)
if not pok then
self:log(DEBUG, "failed to acquire lock: ", perr)
return nil, "failed to acquire lock"
end

local final_ok, final_err = pcall(fn)

local ok, err = tlock:unlock()
if not ok then
-- recoverable: not returning this error, only logging it
self:log(ERR, "failed to release lock '", lock_key, "': ", err)
end

return final_ok, final_err

end


-- Run the given function holding a lock on the target.
-- @param self The checker object
-- @param ip Target IP
-- @param port Target port
-- @param hostname Target hostname
-- @param fn The function to execute
-- @return The results of the function; or true in case it fails locking and
-- @return The results of the function; or "scheduled" in case it fails locking and
-- will retry asynchronously; or nil+err in case it fails to retry.
local function locking_target(self, ip, port, hostname, fn)
local ok, err = run_mutexed_fn(false, self, ip, port, hostname, fn)
if err == "failed to acquire lock" then
local _, terr = ngx_timer_at(0, run_mutexed_fn, self, ip, port, hostname, fn)
if terr ~= nil then
return nil, terr
end
local key = key_for(self.TARGET_LOCK, ip, port, hostname)

return true
local ok, err = run_locked(self, key, fn)

if ok == "scheduled" then
self:log(DEBUG, "target function for ", key, " was re-scheduled")
end

return ok, err
Expand Down Expand Up @@ -1100,7 +1153,12 @@ local function checker_callback(self, health_mode)

-- create a list of targets to check, here we can still do this atomically
local list_to_check = {}
local targets = fetch_target_list(self)
local targets, err = fetch_target_list(self)
if not targets then
self:log(ERR, "checker_callback: ", err)
return
end

for _, target in ipairs(targets) do
local tgt = get_target(self, target.ip, target.port, target.hostname)
local internal_health = tgt and tgt.internal_health or nil
Expand Down

0 comments on commit c0950b9

Please sign in to comment.