From cea68f3c6769dc65183361fb0b752a83977065d4 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Mon, 23 Sep 2024 23:45:28 +0200 Subject: [PATCH 01/17] test: use vardir for repository copying Upgrade-tests use git to clone the current repository and check the necessary versions out. The cloned repository was always saved in vshard/test/var, even when the actual --var argument for test-run.py was /tmp/var. Lets better make the copied code also stored by the path given in --var. NO_DOC=internal --- test/lua_libs/util.lua | 4 +++- test/reload_evolution/storage.result | 4 ++-- test/reload_evolution/storage.test.lua | 4 ++-- test/reload_evolution/storage_1_a.lua | 12 +++--------- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/test/lua_libs/util.lua b/test/lua_libs/util.lua index 3f435ccd..fd9d947e 100644 --- a/test/lua_libs/util.lua +++ b/test/lua_libs/util.lua @@ -181,8 +181,10 @@ if not BUILDDIR then BUILDDIR = SOURCEDIR end +local VARDIR = fio.abspath(os.getenv('VARDIR') or './') + local function git_checkout(dst_dir, version) - local vshard_copy_path = BUILDDIR..'/test/var/'..dst_dir + local vshard_copy_path = VARDIR..'/'..dst_dir -- Cleanup the directory after a previous build. os.execute('rm -rf ' .. vshard_copy_path) -- `git worktree` cannot be used because PACKPACK mounts diff --git a/test/reload_evolution/storage.result b/test/reload_evolution/storage.result index 4a1b3a09..74cf355a 100644 --- a/test/reload_evolution/storage.result +++ b/test/reload_evolution/storage.result @@ -24,10 +24,10 @@ REPLICASET_1 = { 'storage_1_a', 'storage_1_b' } REPLICASET_2 = { 'storage_2_a', 'storage_2_b' } --- ... -test_run:create_cluster(REPLICASET_1, 'reload_evolution') +test_run:create_cluster(REPLICASET_1, 'reload_evolution', {args = vshard_copy_path}) --- ... -test_run:create_cluster(REPLICASET_2, 'reload_evolution') +test_run:create_cluster(REPLICASET_2, 'reload_evolution', {args = vshard_copy_path}) --- ... util = require('util') diff --git a/test/reload_evolution/storage.test.lua b/test/reload_evolution/storage.test.lua index 3800309a..02a9cca6 100644 --- a/test/reload_evolution/storage.test.lua +++ b/test/reload_evolution/storage.test.lua @@ -12,8 +12,8 @@ vshard_copy_path = util.git_checkout('vshard_git_tree_copy', REPLICASET_1 = { 'storage_1_a', 'storage_1_b' } REPLICASET_2 = { 'storage_2_a', 'storage_2_b' } -test_run:create_cluster(REPLICASET_1, 'reload_evolution') -test_run:create_cluster(REPLICASET_2, 'reload_evolution') +test_run:create_cluster(REPLICASET_1, 'reload_evolution', {args = vshard_copy_path}) +test_run:create_cluster(REPLICASET_2, 'reload_evolution', {args = vshard_copy_path}) util = require('util') util.wait_master(test_run, REPLICASET_1, 'storage_1_a') util.wait_master(test_run, REPLICASET_2, 'storage_2_a') diff --git a/test/reload_evolution/storage_1_a.lua b/test/reload_evolution/storage_1_a.lua index eb189a0e..f4b19a0e 100755 --- a/test/reload_evolution/storage_1_a.lua +++ b/test/reload_evolution/storage_1_a.lua @@ -1,16 +1,10 @@ #!/usr/bin/env tarantool local util = require('util') NAME = require('fio').basename(arg[0], '.lua') - --- Run one storage on a different vshard version. --- To do that, place vshard src to --- BUILDDIR/test/var/vshard_git_tree_copy/. +local source_path = arg[1] original_package_path = package.path if NAME == 'storage_2_a' then - vshard_copy = util.BUILDDIR .. 
'/test/var/vshard_git_tree_copy' - package.path = string.format( - '%s/?.lua;%s/?/init.lua;%s', - vshard_copy, vshard_copy, original_package_path - ) + package.path = string.format('%s/?.lua;%s/?/init.lua;%s', source_path, + source_path, package.path) end require('storage_template') From c437f69a928163a946e9e07658b5d6c14d03b1bf Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Thu, 10 Oct 2024 18:45:54 +0200 Subject: [PATCH 02/17] test: enable strict mode in Lua It makes the unknown variables treated as errors, not as 'nil's. Otherwise it is too easy to use a wrong variable name somewhere and get it as nil, and the tests would even pass, but not test what they are supposed to. NO_DOC=internal --- test/luatest_helpers/vtest.lua | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/luatest_helpers/vtest.lua b/test/luatest_helpers/vtest.lua index a4fc2091..1a751532 100644 --- a/test/luatest_helpers/vtest.lua +++ b/test/luatest_helpers/vtest.lua @@ -8,6 +8,9 @@ local yaml = require('yaml') local vcfg = require('vshard.cfg') local vrepset = require('vshard.replicaset') local log = require('log') +-- Otherwise in non-Debug builds apparently the unknown variables are treated as +-- nil-global-variables. +require('strict').on() local wait_timeout = 50 -- Use it in busy-loops like `while !cond do fiber.sleep(busy_step) end`. From 5e3ebb4840d602f00e3a959ca2f79b1863c61080 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Tue, 24 Sep 2024 21:37:44 +0200 Subject: [PATCH 03/17] router: extract a couple of map-reduce helpers The only map-reduce function is router.map_callrw(). At some point there was a task to introduce a new mode of Map-Reduce - partial, by bucket IDs, not on the whole cluster. For that task was introduced a new function router.map_part_callrw() which had the same Map-Reduce and error handling stages. Only Ref stage was different. The new helpers in this commit were supposed to reuse some code between those two map-call functions. Later it was decided to leave just one map-call function and add a new option to it. But these new helpers still look useful to have as separate functions. They make the map-call function really small and simple. NO_DOC=internal NO_TEST=refactoring --- vshard/router/init.lua | 174 +++++++++++++++++++++++------------------ 1 file changed, 98 insertions(+), 76 deletions(-) diff --git a/vshard/router/init.lua b/vshard/router/init.lua index cbd978ef..f16fb69e 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -761,6 +761,95 @@ local function router_call(router, bucket_id, opts, ...) ...) end +-- +-- Perform map-reduce stages on the given set of replicasets. The map expects a +-- valid ref ID which must be done beforehand. +-- +local function replicasets_map_reduce(replicasets, rid, func, args, opts) + assert(opts) + local timeout = opts.timeout or consts.CALL_TIMEOUT_MIN + local do_return_raw = opts.return_raw + local deadline = fiber_clock() + timeout + local opts_map = {is_async = true, return_raw = do_return_raw} + local futures = {} + local map = {} + -- + -- Map stage: send. + -- + args = {'storage_map', rid, func, args} + for _, rs in pairs(replicasets) do + local res, err = rs:callrw('vshard.storage._call', args, opts_map) + if res == nil then + return nil, err, rs.id + end + futures[rs.id] = res + end + -- + -- Map stage: collect. + -- + if do_return_raw then + for id, f in pairs(futures) do + local res, err = future_wait(f, timeout) + if res == nil then + return nil, err, id + end + -- Map returns true,res or nil,err. 
+ res = res:iterator() + local count = res:decode_array_header() + local ok = res:decode() + if ok == nil then + return nil, res:decode(), id + end + if count > 1 then + map[id] = res:take_array(count - 1) + end + timeout = deadline - fiber_clock() + end + else + for id, f in pairs(futures) do + local res, err = future_wait(f, timeout) + if res == nil then + return nil, err, id + end + local ok + -- Map returns true,res or nil,err. + ok, res = res[1], res[2] + if ok == nil then + return nil, res, id + end + if res ~= nil then + -- Store as a table so in future it could be extended for + -- multireturn. + map[id] = {res} + end + timeout = deadline - fiber_clock() + end + end + return map +end + +-- +-- Cancel whatever pending requests are still waiting for a response and free +-- the given ref ID on all the affected storages. This is helpful when +-- map-reduce breaks in the middle. Makes sense to let the refs go to unblock +-- the rebalancer. +-- +local function replicasets_map_cancel_refs(replicasets, futures, rid) + local opts_async = {is_async = true} + for id, f in pairs(futures) do + f:discard() + -- Best effort to remove the created refs before exiting. Can help if + -- the timeout was big and the error happened early. + f = replicasets[id]:callrw('vshard.storage._call', + {'storage_unref', rid}, opts_async) + if f ~= nil then + -- Don't care waiting for a result - no time for this. But it won't + -- affect the request sending if the connection is still alive. + f:discard() + end + end +end + -- -- Consistent Map-Reduce. The given function is called on all masters in the -- cluster with a guarantee that in case of success it was executed with all @@ -811,11 +900,10 @@ local function router_map_callrw(router, func, args, opts) timeout = consts.CALL_TIMEOUT_MIN end local deadline = fiber_clock() + timeout - local err, err_id, res, ok, map + local err, err_id, res, map local futures = {} local bucket_count = 0 - local opts_ref = {is_async = true} - local opts_map = {is_async = true, return_raw = do_return_raw} + local opts_async = {is_async = true} local rs_count = 0 local rid = M.ref_id M.ref_id = rid + 1 @@ -834,7 +922,7 @@ local function router_map_callrw(router, func, args, opts) goto fail end res, err = rs:callrw('vshard.storage._call', - {'storage_ref', rid, timeout}, opts_ref) + {'storage_ref', rid, timeout}, opts_async) if res == nil then err_id = id goto fail @@ -842,7 +930,6 @@ local function router_map_callrw(router, func, args, opts) futures[id] = res rs_count = rs_count + 1 end - map = table_new(0, rs_count) -- -- Ref stage: collect. -- @@ -873,79 +960,14 @@ local function router_map_callrw(router, func, args, opts) router.total_bucket_count - bucket_count) goto fail end - -- - -- Map stage: send. - -- - args = {'storage_map', rid, func, args} - for id, rs in pairs(replicasets) do - res, err = rs:callrw('vshard.storage._call', args, opts_map) - if res == nil then - err_id = id - goto fail - end - futures[id] = res + map, err, err_id = replicasets_map_reduce(replicasets, rid, func, args, { + timeout = timeout, return_raw = do_return_raw + }) + if map then + return map end - -- - -- Ref stage: collect. - -- - if do_return_raw then - for id, f in pairs(futures) do - res, err = future_wait(f, timeout) - if res == nil then - err_id = id - goto fail - end - -- Map returns true,res or nil,err. 
- res = res:iterator() - local count = res:decode_array_header() - ok = res:decode() - if ok == nil then - err = res:decode() - err_id = id - goto fail - end - if count > 1 then - map[id] = res:take_array(count - 1) - end - timeout = deadline - fiber_clock() - end - else - for id, f in pairs(futures) do - res, err = future_wait(f, timeout) - if res == nil then - err_id = id - goto fail - end - -- Map returns true,res or nil,err. - ok, res = res[1], res[2] - if ok == nil then - err = res - err_id = id - goto fail - end - if res ~= nil then - -- Store as a table so in future it could be extended for - -- multireturn. - map[id] = {res} - end - timeout = deadline - fiber_clock() - end - end - do return map end - ::fail:: - for id, f in pairs(futures) do - f:discard() - -- Best effort to remove the created refs before exiting. Can help if - -- the timeout was big and the error happened early. - f = replicasets[id]:callrw('vshard.storage._call', - {'storage_unref', rid}, opts_ref) - if f ~= nil then - -- Don't care waiting for a result - no time for this. But it won't - -- affect the request sending if the connection is still alive. - f:discard() - end - end + replicasets_map_cancel_refs(replicasets, futures, rid) err = lerror.make(err) return nil, err, err_id end From 165dc1801d40125afd2d67d4ef3a25c0f10ab3f6 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Wed, 9 Oct 2024 20:20:08 +0200 Subject: [PATCH 04/17] storage: introduce ref.check() It ensures the ref is still in place. A read-only operation. It is going to be used in the future commits about partial map-reduce. Router will be going potentially more than once to some storages and at all times it would ensure the ref is still in place. NO_DOC=internal --- test/storage/ref.result | 42 +++++++++++++++++++++++++++++++++++++++ test/storage/ref.test.lua | 16 +++++++++++++++ vshard/storage/ref.lua | 18 +++++++++++++++++ 3 files changed, 76 insertions(+) diff --git a/test/storage/ref.result b/test/storage/ref.result index a81ab5d8..d53176bd 100644 --- a/test/storage/ref.result +++ b/test/storage/ref.result @@ -249,6 +249,11 @@ function make_ref(rid, timeout) end | --- | ... +function check_ref(rid) \ + return lref.check(rid, box.session.id()) \ +end + | --- + | ... function use_ref(rid) \ return lref.use(rid, box.session.id()) \ end @@ -289,6 +294,43 @@ _ = test_run:switch('storage_1_a') | --- | ... +-- Check works. +ok, err = c:call('check_ref', {1}) + | --- + | ... +assert(ok and not err) + | --- + | - true + | ... +_ = test_run:switch('storage_2_a') + | --- + | ... +assert(lref.count == 1) + | --- + | - true + | ... +_ = test_run:switch('storage_1_a') + | --- + | ... + +ok, err = c:call('check_ref', {2}) + | --- + | ... +assert(ok == nil and err.message) + | --- + | - 'Can not use a storage ref: no ref' + | ... +_ = test_run:switch('storage_2_a') + | --- + | ... +assert(lref.count == 1) + | --- + | - true + | ... +_ = test_run:switch('storage_1_a') + | --- + | ... + -- Use works. 
c:call('use_ref', {1}) | --- diff --git a/test/storage/ref.test.lua b/test/storage/ref.test.lua index 17264579..76c67a33 100644 --- a/test/storage/ref.test.lua +++ b/test/storage/ref.test.lua @@ -106,6 +106,9 @@ small_timeout = 0.001 function make_ref(rid, timeout) \ return lref.add(rid, box.session.id(), timeout) \ end +function check_ref(rid) \ + return lref.check(rid, box.session.id()) \ +end function use_ref(rid) \ return lref.use(rid, box.session.id()) \ end @@ -124,6 +127,19 @@ _ = test_run:switch('storage_2_a') assert(lref.count == 1) _ = test_run:switch('storage_1_a') +-- Check works. +ok, err = c:call('check_ref', {1}) +assert(ok and not err) +_ = test_run:switch('storage_2_a') +assert(lref.count == 1) +_ = test_run:switch('storage_1_a') + +ok, err = c:call('check_ref', {2}) +assert(ok == nil and err.message) +_ = test_run:switch('storage_2_a') +assert(lref.count == 1) +_ = test_run:switch('storage_1_a') + -- Use works. c:call('use_ref', {1}) _ = test_run:switch('storage_2_a') diff --git a/vshard/storage/ref.lua b/vshard/storage/ref.lua index df7af092..0b517819 100644 --- a/vshard/storage/ref.lua +++ b/vshard/storage/ref.lua @@ -247,6 +247,14 @@ local function ref_session_new(sid) return true end + local function ref_session_check(self, rid) + local ref = ref_map[rid] + if not ref then + return nil, lerror.vshard(lerror.code.STORAGE_REF_USE, 'no ref') + end + return true + end + -- -- Ref use means it can't be expired until deleted explicitly. Should be -- done when the request affecting the whole storage starts. After use it is @@ -301,6 +309,7 @@ local function ref_session_new(sid) del = ref_session_del, gc = ref_session_gc, add = ref_session_add, + check = ref_session_check, use = ref_session_use, kill = ref_session_kill, } @@ -364,6 +373,14 @@ local function ref_add(rid, sid, timeout) return nil, err end +local function ref_check(rid, sid) + local session = M.session_map[sid] + if not session then + return nil, lerror.vshard(lerror.code.STORAGE_REF_USE, 'no session') + end + return session:check(rid) +end + local function ref_use(rid, sid) local session = M.session_map[sid] if not session then @@ -407,6 +424,7 @@ end M.del = ref_del M.gc = ref_gc M.add = ref_add +M.check = ref_check M.use = ref_use M.cfg = ref_cfg M.kill = ref_kill_session From b7436019d44f112d3db5ba877948b10e91a1e1d8 Mon Sep 17 00:00:00 2001 From: Denis Smirnov Date: Fri, 24 Nov 2023 12:14:42 +0700 Subject: [PATCH 05/17] storage: implement partial map-reduce API The storage-size of the Partial Map-Reduce feature. 
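A rough usage sketch of the storage-side service calls this patch adds (the
rid, timeout and bucket id values below are arbitrary placeholders, not part
of the patch):

    -- Take a ref and learn which of the requested buckets are absent here.
    local res, err = vshard.storage._call('storage_ref_with_lookup',
                                          rid, timeout, {1, 2, 3})
    -- res.moved lists the buckets which are not active on this storage.
    -- res.rid is nil when none of the requested buckets are present - then
    -- no ref is taken at all.

    -- 'storage_map' later consumes the ref; if the map stage is never
    -- reached, the ref can be dropped explicitly:
    local _, err = vshard.storage._call('storage_unref', rid)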
NO_DOC=later --- test/instances/storage.lua | 17 ++++ test/storage-luatest/storage_1_test.lua | 114 ++++++++++++++++++++++++ test/upgrade/upgrade.result | 2 + vshard/storage/init.lua | 66 ++++++++++++++ 4 files changed, 199 insertions(+) diff --git a/test/instances/storage.lua b/test/instances/storage.lua index 4bbff541..95cd4a7a 100755 --- a/test/instances/storage.lua +++ b/test/instances/storage.lua @@ -13,6 +13,7 @@ _G.ivconst = require('vshard.consts') _G.ivutil = require('vshard.util') _G.iverror = require('vshard.error') _G.ivtest = require('test.luatest_helpers.vtest') +_G.itable_new = require('table.new') _G.iwait_timeout = _G.ivtest.wait_timeout @@ -69,6 +70,21 @@ local function get_first_bucket() return res ~= nil and res.id or nil end +local function get_n_buckets(n) + if n <= 0 then + error('Invalid number of buckets') + end + local index = box.space._bucket.index.status + local ids = _G.itable_new(0, n) + for _, tuple in index:pairs(vconst.BUCKET.ACTIVE) do + table.insert(ids, tuple.id) + if #ids == n then + return ids + end + end + error('Not enough buckets') +end + local function session_set(key, value) box.session.storage[key] = value return true @@ -167,6 +183,7 @@ _G.box_error = box_error _G.echo = echo _G.get_uuid = get_uuid _G.get_first_bucket = get_first_bucket +_G.get_n_buckets = get_n_buckets _G.session_set = session_set _G.session_get = session_get _G.bucket_gc_wait = bucket_gc_wait diff --git a/test/storage-luatest/storage_1_test.lua b/test/storage-luatest/storage_1_test.lua index 0d22df20..2e27f811 100644 --- a/test/storage-luatest/storage_1_test.lua +++ b/test/storage-luatest/storage_1_test.lua @@ -225,3 +225,117 @@ test_group.test_bucket_skip_validate = function(g) internal.errinj.ERRINJ_SKIP_BUCKET_STATUS_VALIDATE = false end) end + +test_group.test_ref_with_lookup = function(g) + g.replica_1_a:exec(function() + local res, err, _ + local timeout = 0.1 + local rid = 42 + local bids = _G.get_n_buckets(2) + local bid_extra = 3001 + + -- Check for a single bucket. + res, err = ivshard.storage._call( + 'storage_ref_with_lookup', + rid, + timeout, + {bids[1]} + ) + ilt.assert_equals(err, nil) + ilt.assert_equals(res, {rid = rid, moved = {}}) + _, err = ivshard.storage._call('storage_unref', rid) + ilt.assert_equals(err, nil) + + -- Check for multiple buckets. + res, err = ivshard.storage._call( + 'storage_ref_with_lookup', + rid, + timeout, + {bids[1], bids[2]} + ) + ilt.assert_equals(err, nil) + ilt.assert_equals(res, {rid = rid, moved = {}}) + _, err = ivshard.storage._call('storage_unref', rid) + ilt.assert_equals(err, nil) + + -- Check for double referencing. + res, err = ivshard.storage._call( + 'storage_ref_with_lookup', + rid, + timeout, + {bids[1], bids[1]} + ) + ilt.assert_equals(err, nil) + ilt.assert_equals(res, {rid = rid, moved = {}}) + _, err = ivshard.storage._call('storage_unref', rid) + ilt.assert_equals(err, nil) + _, err = ivshard.storage._call('storage_unref', rid) + t.assert_str_contains(err.message, + 'Can not delete a storage ref: no ref') + + -- Check for an absent bucket. + res, err = ivshard.storage._call( + 'storage_ref_with_lookup', + rid, + timeout, + {bids[1], bids[2], bid_extra} + ) + ilt.assert_equals(err, nil) + ilt.assert_equals(res, {rid = rid, moved = {bid_extra}}) + ivshard.storage._call('storage_unref', rid) + + -- Check that we do not create a reference if there are no buckets. 
+ res, err = ivshard.storage._call( + 'storage_ref_with_lookup', + rid, + timeout, + {bid_extra} + ) + ilt.assert_equals(err, nil) + ilt.assert_equals(res, {rid = nil, moved = {bid_extra}}) + res, err = ivshard.storage._call('storage_unref', rid) + t.assert_str_contains(err.message, + 'Can not delete a storage ref: no ref') + ilt.assert_equals(res, nil) + + -- Check for a timeout. + -- Emulate a case when all buckets are not writable. + local func = ivshard.storage.internal.bucket_are_all_rw + ivshard.storage.internal.bucket_are_all_rw = function() return false end + res, err = ivshard.storage._call( + 'storage_ref_with_lookup', + rid, + timeout, + {bids[1]} + ) + ivshard.storage.internal.bucket_are_all_rw = func + t.assert_str_contains(err.message, 'Timeout exceeded') + ilt.assert_equals(res, nil) + -- Check that the reference was not created. + res, err = ivshard.storage._call('storage_unref', rid) + t.assert_str_contains(err.message, + 'Can not delete a storage ref: no ref') + ilt.assert_equals(res, nil) + end) +end + +test_group.test_absent_buckets = function(g) + g.replica_1_a:exec(function() + local res, err = ivshard.storage._call( + 'storage_absent_buckets', + {_G.get_first_bucket()} + ) + ilt.assert_equals(err, nil) + ilt.assert_equals(res, {}) + end) + + g.replica_1_a:exec(function() + local bid_extra = 3001 + local res, err = ivshard.storage._call( + 'storage_absent_buckets', + {_G.get_first_bucket(), bid_extra} + ) + ilt.assert_equals(err, nil) + ilt.assert_equals(res, {bid_extra}) + end) +end diff --git a/test/upgrade/upgrade.result b/test/upgrade/upgrade.result index 5f4f0582..c5b99ba0 100644 --- a/test/upgrade/upgrade.result +++ b/test/upgrade/upgrade.result @@ -177,8 +177,10 @@ vshard.storage._call('test_api', 1, 2, 3) | - rebalancer_apply_routes | - rebalancer_request_state | - recovery_bucket_stat + | - storage_absent_buckets | - storage_map | - storage_ref + | - storage_ref_with_lookup | - storage_unref | - test_api | - 1 diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua index f6809a0d..2fbfba27 100644 --- a/vshard/storage/init.lua +++ b/vshard/storage/init.lua @@ -3180,6 +3180,70 @@ local function storage_ref(rid, timeout) return bucket_count() end +-- +-- Lookup for absent active buckets. +-- +-- @param bucket_ids List of bucket identifiers. +-- @return A dictionary with a list of bucket identifiers which are +-- not present on the current instance. +-- +local function storage_absent_buckets(bucket_ids) + local status = consts.BUCKET + local moved_buckets = {} + for _, bucket_id in pairs(bucket_ids) do + local bucket = box.space._bucket:get{bucket_id} + if not bucket or bucket.status ~= status.ACTIVE then + table.insert(moved_buckets, bucket_id) + end + end + return { moved = moved_buckets } +end + +-- +-- Bind a new storage ref to the current box session and check +-- that all the buckets are present. Is used as a part of the +-- partial Map-Reduce API. +-- +-- @param rid Unique reference ID. +-- @param timeout Seconds. +-- @param bucket_ids List of bucket identifiers. +-- @return A dictionary with reference id and a list of bucket identifiers +-- which are not present on the current instance. If all the buckets +-- are absent, the reference is not created and a nil reference id +-- with the list of absent buckets is returned. 
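+--
+-- A minimal usage sketch (the rid, timeout and bucket ids below are
+-- arbitrary examples):
+--
+--     local res, err = vshard.storage._call('storage_ref_with_lookup',
+--                                           rid, timeout, {1, 2, 3})
+--     -- res.moved - absent buckets, res.rid - the created ref or nil.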
+-- +local function storage_ref_with_lookup(rid, timeout, bucket_ids) + local moved = storage_absent_buckets(bucket_ids).moved + if #moved == #bucket_ids then + -- Take an advantage that moved buckets are returned in the same + -- order as in the input list. + local do_match = true + local next_moved = next(moved) + local next_passed = next(bucket_ids) + ::continue:: + if next_moved then + if next_moved == next_passed then + next_moved = next(moved, next_moved) + next_passed = next(bucket_ids, next_passed) + goto continue + else + do_match = false + end + end + if do_match then + -- If all the passed buckets are absent, there is no need + -- to create a ref. + return {rid = nil, moved = moved} + end + end + + local ok, err = storage_ref(rid, timeout) + if not ok then + return nil, err + end + return {rid = rid, moved = moved} +end + -- -- Drop a storage ref from the current box session. Is used as a part of -- Map-Reduce API. @@ -3234,7 +3298,9 @@ service_call_api = setmetatable({ rebalancer_apply_routes = rebalancer_apply_routes, rebalancer_request_state = rebalancer_request_state, recovery_bucket_stat = recovery_bucket_stat, + storage_absent_buckets = storage_absent_buckets, storage_ref = storage_ref, + storage_ref_with_lookup = storage_ref_with_lookup, storage_unref = storage_unref, storage_map = storage_map, info = storage_service_info, From 87aefc7a39814395e79716d9f9880db82f8cc533 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Wed, 18 Sep 2024 19:43:52 +0200 Subject: [PATCH 06/17] Make the tests pass from prev commit The previous commit was failing some tests. Lets patch them up. That commit isn't amended so as to keep its original shape in respect to the external contributor. NO_DOC=bugfix --- test/instances/storage.lua | 3 +-- test/storage-luatest/storage_1_test.lua | 7 ++++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/test/instances/storage.lua b/test/instances/storage.lua index 95cd4a7a..964da042 100755 --- a/test/instances/storage.lua +++ b/test/instances/storage.lua @@ -13,7 +13,6 @@ _G.ivconst = require('vshard.consts') _G.ivutil = require('vshard.util') _G.iverror = require('vshard.error') _G.ivtest = require('test.luatest_helpers.vtest') -_G.itable_new = require('table.new') _G.iwait_timeout = _G.ivtest.wait_timeout @@ -75,7 +74,7 @@ local function get_n_buckets(n) error('Invalid number of buckets') end local index = box.space._bucket.index.status - local ids = _G.itable_new(0, n) + local ids = table.new(0, n) for _, tuple in index:pairs(vconst.BUCKET.ACTIVE) do table.insert(ids, tuple.id) if #ids == n then diff --git a/test/storage-luatest/storage_1_test.lua b/test/storage-luatest/storage_1_test.lua index 2e27f811..9e760ebe 100644 --- a/test/storage-luatest/storage_1_test.lua +++ b/test/storage-luatest/storage_1_test.lua @@ -282,7 +282,8 @@ test_group.test_ref_with_lookup = function(g) ) ilt.assert_equals(err, nil) ilt.assert_equals(res, {rid = rid, moved = {bid_extra}}) - ivshard.storage._call('storage_unref', rid) + _, err = ivshard.storage._call('storage_unref', rid) + ilt.assert_equals(err, nil) -- Check that we do not create a reference if there are no buckets. 
res, err = ivshard.storage._call( @@ -326,7 +327,7 @@ test_group.test_absent_buckets = function(g) {_G.get_first_bucket()} ) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {}) + ilt.assert_equals(res, {moved = {}}) end) g.replica_1_a:exec(function() @@ -336,6 +337,6 @@ test_group.test_absent_buckets = function(g) {_G.get_first_bucket(), bid_extra} ) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {bid_extra}) + ilt.assert_equals(res, {moved = {bid_extra}}) end) end From 5ab5b4c7eb6a577e758ece7524981a6d289466a6 Mon Sep 17 00:00:00 2001 From: Denis Smirnov Date: Fri, 24 Nov 2023 12:15:36 +0700 Subject: [PATCH 07/17] router: implement partial map-reduce API Introduce a partial ref-map-reduce API for vshard. It guarantees that in case of success the function is executed exactly once on the storages, that contain the given list of buckets. NO_DOC=later --- test/luatest_helpers/vtest.lua | 10 + test/router-luatest/map_part_test.lua | 300 ++++++++++++++++++++++++++ test/router-luatest/router_test.lua | 19 ++ vshard/router/init.lua | 235 ++++++++++++++++++++ 4 files changed, 564 insertions(+) create mode 100644 test/router-luatest/map_part_test.lua diff --git a/test/luatest_helpers/vtest.lua b/test/luatest_helpers/vtest.lua index 1a751532..5350140d 100644 --- a/test/luatest_helpers/vtest.lua +++ b/test/luatest_helpers/vtest.lua @@ -466,6 +466,15 @@ local function storage_first_bucket(storage) end) end +-- +-- Get n active buckets from the storage. +-- +local function storage_get_n_buckets(storage, n) + return storage:exec(function(n) + return _G.get_n_buckets(n) + end, {n}) +end + -- -- Disable rebalancer on all storages. -- @@ -854,6 +863,7 @@ return { cluster_wait_fullsync = cluster_wait_fullsync, cluster_rebalancer_find = cluster_rebalancer_find, storage_first_bucket = storage_first_bucket, + storage_get_n_buckets = storage_get_n_buckets, storage_stop = storage_stop, storage_start = storage_start, router_new = router_new, diff --git a/test/router-luatest/map_part_test.lua b/test/router-luatest/map_part_test.lua new file mode 100644 index 00000000..8a9dc3d3 --- /dev/null +++ b/test/router-luatest/map_part_test.lua @@ -0,0 +1,300 @@ + +local t = require('luatest') +local vtest = require('test.luatest_helpers.vtest') + +local g = t.group('router') +local cfg_template = { + sharding = { + { + replicas = { + replica_1_a = { + master = true, + }, + replica_1_b = {}, + }, + }, + { + replicas = { + replica_2_a = { + master = true, + }, + replica_2_b = {}, + }, + }, + }, + bucket_count = 100, + test_user_grant_range = 'super', +} +local global_cfg = vtest.config_new(cfg_template) + +g.before_all(function() + vtest.cluster_new(g, global_cfg) + + t.assert_equals(g.replica_1_a:exec(function() + return #ivshard.storage.info().alerts + end), 0, 'no alerts after boot') + + local router = vtest.router_new(g, 'router', global_cfg) + g.router = router + local res, err = router:exec(function() + return ivshard.router.bootstrap({timeout = iwait_timeout}) + end) + t.assert(res and not err, 'bootstrap buckets') +end) + +g.after_all(function() + g.cluster:drop() +end) + +local function map_part_init() + local rs1_uuid = g.replica_1_a:replicaset_uuid() + local rs2_uuid = g.replica_2_a:replicaset_uuid() + + local create_map_func_f = function(res1) + rawset(_G, 'do_map', function(res2) + return {res1, res2} + end) + end + g.replica_1_a:exec(create_map_func_f, {1}) + g.replica_2_a:exec(create_map_func_f, {2}) + + local bids1 = vtest.storage_get_n_buckets(g.replica_1_a, 4) + local bids2 = 
vtest.storage_get_n_buckets(g.replica_2_a, 1) + + return { + rs1_uuid = rs1_uuid, + rs2_uuid = rs2_uuid, + bids1 = bids1, + bids2 = bids2, + } +end + +g.test_map_part_single_rs = function(g) + local expected, res + local init = map_part_init() + + res = g.router:exec(function(bid1, bid2) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid1, bid2}, + 'do_map', + {3}, + {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[3], init.bids1[2]}) + t.assert_equals(res.err, nil) + t.assert_equals(res.err_uuid, nil) + expected = { + [init.rs1_uuid] = {{1, 3}}, + } + t.assert_equals(res.val, expected) +end + +g.test_map_part_multi_rs = function(g) + local expected, res + local init = map_part_init() + + res = g.router:exec(function(bid1, bid2) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[1], init.bids2[1]}) + t.assert_equals(res.err, nil) + t.assert_equals(res.err_uuid, nil) + expected = { + [init.rs1_uuid] = {{1, 42}}, + [init.rs2_uuid] = {{2, 42}}, + } + t.assert_equals(res.val, expected) +end + +g.test_map_part_ref = function(g) + local expected, res + local init = map_part_init() + + -- First move some buckets from rs1 to rs2 and then pause gc on rs1. + -- As a result, the buckets will be in the SENT state on rs1 and + -- in the ACTIVE state on rs2. + g.replica_1_a:exec(function(bid1, bid2, to) + ivshard.storage.internal.errinj.ERRINJ_BUCKET_GC_PAUSE = true + ivshard.storage.bucket_send(bid1, to) + ivshard.storage.bucket_send(bid2, to) + end, {init.bids1[1], init.bids1[2], init.rs2_uuid}) + -- The buckets are ACTIVE on rs2, so the partial map should succeed. + res = g.router:exec(function(bid1, bid2) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[1], init.bids1[2]}) + t.assert_equals(res.err, nil) + t.assert_equals(res.err_uuid, nil) + expected = { + [init.rs2_uuid] = {{2, 42}}, + } + t.assert_equals(res.val, expected) + -- But if we use some active bucket from rs1, the partial map should fail. + -- The reason is that the moved buckets are still in the SENT state and + -- we can't take a ref. + res = g.router:exec(function(bid1) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid1}, 'do_map', {42}, {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[3]}) + t.assert_equals(res.val, nil) + t.assert(res.err) + t.assert_equals(res.err_uuid, init.rs1_uuid) + -- The moved buckets still exist on the rs1 with non-active status. + -- Let's remove them and re-enable gc on rs1. + g.replica_1_a:exec(function() + ivshard.storage.internal.errinj.ERRINJ_BUCKET_GC_PAUSE = false + _G.bucket_gc_wait() + end) + -- Now move the buckets back to rs1 and pause gc on rs2. + -- The buckets will be ACTIVE on rs1 and SENT on rs2, + -- so the partial map should succeed. 
+ res = g.replica_2_a:exec(function(bid1, bid2, to) + ivshard.storage.internal.errinj.ERRINJ_BUCKET_GC_PAUSE = true + ivshard.storage.bucket_send(bid1, to) + local res, err = ivshard.storage.bucket_send(bid2, to) + return { + res = res, + err = err, + } + end, {init.bids1[1], init.bids1[2], init.rs1_uuid}) + t.assert_equals(res.err, nil) + + res = g.router:exec(function(bid1, bid2) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[1], init.bids1[2]}) + t.assert_equals(res.err, nil) + t.assert_equals(res.err_uuid, nil) + expected = { + [init.rs1_uuid] = {{1, 42}}, + } + t.assert_equals(res.val, expected) + -- Re-enable gc on rs2. + g.replica_2_a:exec(function() + ivshard.storage.internal.errinj.ERRINJ_BUCKET_GC_PAUSE = false + _G.bucket_gc_wait() + end) +end + +g.test_map_part_double_ref = function(g) + local expected, res + local init = map_part_init() + + -- First, disable discovery on the router to disable route cache update. + g.router:exec(function() + ivshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY = true + end) + -- Then, move the bucket form rs1 to rs2. Now the router has an outdated + -- route cache. + g.replica_1_a:exec(function(bid, to) + ivshard.storage.bucket_send(bid, to) + end, {init.bids1[4], init.rs2_uuid}) + -- Call a partial map for the moved bucket and some bucket from rs2. The ref + -- stage should be done in two steps: + -- 1. ref rs2 and returns the moved bucket; + -- 2. discover the moved bucket on rs2 and avoid double reference; + res = g.router:exec(function(bid1, bid2) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[4], init.bids2[1]}) + t.assert_equals(res.err, nil) + t.assert_equals(res.err_uuid, nil) + expected = { + [init.rs2_uuid] = {{2, 42}}, + } + t.assert_equals(res.val, expected) + -- Call a partial map one more time to make sure there are no references + -- left. + res = g.router:exec(function(bid1, bid2) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[4], init.bids2[1]}) + t.assert_equals(res.err, nil) + t.assert_equals(res.err_uuid, nil) + t.assert_equals(res.val, expected) + -- Return the bucket back and re-enable discovery on the router. + g.replica_2_a:exec(function(bid, to) + ivshard.storage.bucket_send(bid, to) + end, {init.bids1[4], init.rs1_uuid}) + g.router:exec(function() + ivshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY = false + end) +end + +g.test_map_part_map = function(g) + local res + local init = map_part_init() + + g.replica_2_a:exec(function() + _G.do_map = function() + return box.error(box.error.PROC_LUA, "map_err") + end + end) + res = g.router:exec(function(bid1, bid2) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid2, bid1}, 'do_map', {3}, {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[1], init.bids2[1]}) + t.assert_equals(res.val, nil) + t.assert_covers(res.err, { + code = box.error.PROC_LUA, + type = 'ClientError', + message = 'map_err' + }) + t.assert_equals(res.err_uuid, init.rs2_uuid) + -- Check that there is no dangling references after the error. 
+ init = map_part_init() + res = g.router:exec(function(bid1, bid2) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid1, bid2}, 'do_map', {3}, {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[1], init.bids2[1]}) + t.assert_equals(res.err, nil) + t.assert_equals(res.err_uuid, nil) + t.assert_equals(res.val, { + [init.rs1_uuid] = {{1, 3}}, + [init.rs2_uuid] = {{2, 3}}, + }) +end diff --git a/test/router-luatest/router_test.lua b/test/router-luatest/router_test.lua index b2e12951..0938b600 100644 --- a/test/router-luatest/router_test.lua +++ b/test/router-luatest/router_test.lua @@ -286,6 +286,25 @@ g.test_map_callrw_raw = function(g) end) end +g.test_group_buckets = function(g) + local bids = vtest.storage_get_n_buckets(g.replica_1_a, 2) + + local res = g.router:exec(function(bid1, bid2) + local val, err = ivshard.router.group({bid2, bid1, bid1}) + return { + val = val, + err = err, + } + end, {bids[1], bids[2]}) + assert(res.err == nil) + local rs1_uuid = g.replica_1_a:replicaset_uuid() + local expected = { + [rs1_uuid] = {bids[1], bids[2]}, + } + table.sort(expected[rs1_uuid]) + t.assert_equals(res.val, expected) +end + g.test_uri_compare_and_reuse = function(g) -- Multilisten itself is not used, but URI-table is supported since the same -- version. diff --git a/vshard/router/init.lua b/vshard/router/init.lua index f16fb69e..97169d38 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -223,6 +223,49 @@ local function bucket_resolve(router, bucket_id) return replicaset end +-- Group bucket ids by replicasets according to the router cache. +local function buckets_group(router, bucket_ids, timeout) + timeout = timeout or consts.CALL_TIMEOUT_MIN + local deadline = fiber_clock() + timeout + local replicaset, err + local replicaset_buckets = {} + local prev_id = nil + local buckets, id + + -- Sort buckets to skip duplicates. + table.sort(bucket_ids) + for _, bucket_id in pairs(bucket_ids) do + if bucket_id == prev_id then + goto continue + end + + if fiber_clock() > deadline then + return nil, lerror.timeout() + end + + replicaset, err = bucket_resolve(router, bucket_id) + if err ~= nil then + return nil, err + end + if replicaset == nil then + return nil, lerror.vshard(lerror.code.NO_ROUTE_TO_BUCKET, bucket_id) + end + + id = replicaset.id + buckets = replicaset_buckets[id] + if buckets then + table.insert(buckets, bucket_id) + else + replicaset_buckets[id] = {bucket_id} + end + + ::continue:: + prev_id = bucket_id + end + + return replicaset_buckets +end + -- -- Arrange downloaded buckets to the route map so as they -- reference a given replicaset. @@ -972,6 +1015,197 @@ local function router_map_callrw(router, func, args, opts) return nil, err, err_id end +-- +-- Partial Ref-Map-Reduce. The given function is called on the masters +-- with a guarantee that in case of success it was executed exactly once +-- on all storages that contain the given buckets. Execution on masters +-- is chosen to allow buckets be accessed for reads and writes during the +-- call. +-- +-- The execution consists of the following stages: +-- +-- Ref stage. We ref all the storages with the given buckets. It is +-- an iterative process, because the router's cache may be outdated, +-- and we need to discover the moved buckets. At the end of this stage +-- we have refed all the storages containing the given buckets with the +-- given timeout. +-- +-- Map stage. We call the given function on all the storages we have +-- refed. 
On the storage side we use a `storage_map` function, which +-- switches the timeout ref to the manual mode and unrefs the storage +-- after the function is called. +-- +-- Reduce stage. We collect the results of the function call. +-- +-- @param router Instance to use. +-- @param bucket_ids List of bucket identifiers. +-- @param func Name of the function to call. +-- @param args Function arguments passed in netbox style (as an array). +-- @param opts Can only contain 'timeout' as a number of seconds. Note that the +-- refs may end up being kept on the storages during this entire timeout if +-- something goes wrong. For instance, network issues appear. This means +-- better not use a value bigger than necessary. A stuck infinite ref can +-- only be dropped by this router restart/reconnect or the storage restart. +-- +-- @return In case of success - a map with replicaset UUID keys and values being +-- what the function returned from the replicaset. +-- +-- @return In case of an error - nil, error object, optional UUID of the +-- replicaset where the error happened. UUID may be not present if it wasn't +-- about concrete replicaset. For example, not all buckets were found even +-- though all replicasets were scanned. +-- +local function router_map_part_callrw(router, bucket_ids, func, args, opts) + local replicaset + local grouped_buckets + local err, err_id, res, ok, map + local call_args + local replicasets = {} + local preallocated = false + local futures = {} + local call_opts = {is_async = true} + local rs_count = 0 + local timeout = opts and opts.timeout or consts.CALL_TIMEOUT_MIN + local deadline = fiber_clock() + timeout + local rid = M.ref_id + M.ref_id = rid + 1 + + -- Nil checks are done explicitly here (== nil instead of 'not'), because + -- netbox requests return box.NULL instead of nils. + + -- Ref stage. + while #bucket_ids > 0 do + -- Group the buckets by replicasets according to the router cache. + grouped_buckets, err = buckets_group(router, bucket_ids, timeout) + if grouped_buckets == nil then + goto fail + end + timeout = deadline - fiber_clock() + + -- Send ref requests with timeouts to the replicasets. + futures = table_new(0, #grouped_buckets) + for id, buckets in pairs(grouped_buckets) do + replicaset = router.replicasets[id] + -- Netbox async requests work only with active connections. Need to + -- wait for the connection explicitly. + timeout, err = replicaset:wait_connected(timeout) + if timeout == nil then + err_id = id + goto fail + end + if replicasets[id] then + -- Replicaset is already referred on the previous iteration. + -- Simply get the moved buckets without double referencing. + call_args = {'storage_absent_buckets', buckets} + else + call_args = {'storage_ref_with_lookup', rid, timeout, buckets} + rs_count = rs_count + 1 + end + res, err = replicaset:callrw('vshard.storage._call', call_args, + call_opts) + if res == nil then + err_id = id + goto fail + end + futures[id] = res + end + + if not preallocated then + -- Current preallocation works only for the first iteration + -- (i.e. router cache is not outdated). + replicasets = table_new(0, rs_count) + preallocated = true + end + + -- Wait for the refs to be done and collect moved buckets. + bucket_ids = {} + for id, future in pairs(futures) do + res, err = future_wait(future, timeout) + -- Handle netbox error first. + if res == nil then + err_id = id + goto fail + end + -- Ref returns nil,err or {rid, moved}. 
+ res, err = res[1], res[2] + if res == nil then + err_id = id + goto fail + end + for _, bucket_id in pairs(res.moved) do + bucket_reset(router, bucket_id) + table.insert(bucket_ids, bucket_id) + end + if res.rid then + -- If there are no buckets on the replicaset, + -- it would not be referred. + replicasets[id] = router.replicasets[id] + end + timeout = deadline - fiber_clock() + end + end + + -- Map stage. + map = table_new(0, rs_count) + futures = table_new(0, rs_count) + args = {'storage_map', rid, func, args} + -- Send map requests. + for id, rs in pairs(replicasets) do + res, err = rs:callrw('vshard.storage._call', args, call_opts) + if res == nil then + err_id = id + goto fail + end + futures[id] = res + end + + -- Reduce stage. + -- Collect map responses (refs were deleted by the storages for non-error + -- results). + for id, future in pairs(futures) do + res, err = future_wait(future, timeout) + if res == nil then + err_id = id + goto fail + end + -- Map returns true,res or nil,err. + ok, res = res[1], res[2] + if ok == nil then + err = res + err_id = id + goto fail + end + if res ~= nil then + -- Store as a table so in future it could be extended for + -- multireturn. + map[id] = {res} + end + timeout = deadline - fiber_clock() + end + do return map end + +::fail:: + local f + for id, future in pairs(futures) do + future:discard() + -- Best effort to remove the created refs before exiting. Can help if + -- the timeout was big and the error happened early. + call_args = {'storage_unref', rid} + replicaset = router.replicasets[id] + if replicaset then + f = replicaset:callrw('vshard.storage._call', call_args, call_opts) + if f ~= nil then + -- Don't care waiting for a result - no time for this. But it + -- won't affect the request sending if the connection is still + -- alive. + f:discard() + end + end + end + err = lerror.make(err) + return nil, err, err_id +end + -- -- Get replicaset object by bucket identifier. -- @param bucket_id Bucket identifier. @@ -1817,6 +2051,7 @@ local router_mt = { callre = router_make_api(router_callre), callbre = router_make_api(router_callbre), map_callrw = router_make_api(router_map_callrw), + map_part_callrw = router_make_api(router_map_part_callrw), route = router_make_api(router_route), routeall = router_make_api(router_routeall), bucket_id = router_make_api(router_bucket_id), From 8c9529395cf9005328f65a75f37a2955450e4749 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Wed, 18 Sep 2024 19:57:02 +0200 Subject: [PATCH 08/17] Make the tests pass from prev commit The previous commit was failing some tests. Lets patch them up. That commit isn't amended so as to keep its original shape in respect to the external contributor. 
NO_DOC=bugfix --- test/router-luatest/router_test.lua | 2 +- vshard/router/init.lua | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/test/router-luatest/router_test.lua b/test/router-luatest/router_test.lua index 0938b600..f9cd6920 100644 --- a/test/router-luatest/router_test.lua +++ b/test/router-luatest/router_test.lua @@ -290,7 +290,7 @@ g.test_group_buckets = function(g) local bids = vtest.storage_get_n_buckets(g.replica_1_a, 2) local res = g.router:exec(function(bid1, bid2) - local val, err = ivshard.router.group({bid2, bid1, bid1}) + local val, err = ivshard.router._buckets_group({bid2, bid1, bid1}) return { val = val, err = err, diff --git a/vshard/router/init.lua b/vshard/router/init.lua index 97169d38..ea4c5fd2 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -2066,6 +2066,7 @@ local router_mt = { discovery_set = router_make_api(discovery_set), _route_map_clear = router_make_api(route_map_clear), _bucket_reset = router_make_api(bucket_reset), + _buckets_group = router_make_api(buckets_group), disable = router_disable, enable = router_enable, } From ff8e8a0f1f58033445e0d2ae41b02435d08b6a2f Mon Sep 17 00:00:00 2001 From: Denis Smirnov Date: Thu, 30 Nov 2023 12:17:00 +0700 Subject: [PATCH 09/17] router: improve master connection parallelism in map-reduce This is useful for RW map-reduce requests which need to send multiple network requests in parallel to multiple masters. In-parallel means using is_async netbox feature. But it only works if the connection is already established. Which means that the connection establishment ideally must also be parallel. NO_DOC=internal NO_TEST=already covered --- vshard/replicaset.lua | 1 + vshard/router/init.lua | 61 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua index abdffe67..086ed4ec 100644 --- a/vshard/replicaset.lua +++ b/vshard/replicaset.lua @@ -1225,6 +1225,7 @@ local replicaset_mt = { up_replica_priority = replicaset_up_replica_priority; wait_connected = replicaset_wait_connected, wait_connected_all = replicaset_wait_connected_all, + wait_master = replicaset_wait_master, call = replicaset_master_call; callrw = replicaset_master_call; callro = replicaset_template_multicallro(false, false); diff --git a/vshard/router/init.lua b/vshard/router/init.lua index ea4c5fd2..48bd9f4e 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -804,6 +804,37 @@ local function router_call(router, bucket_id, opts, ...) ...) end +local function wait_connected_to_masters(replicasets, timeout) + local master, id + local err, err_id + + -- Start connecting to all masters in parallel. + local deadline = fiber_clock() + timeout + for _, replicaset in pairs(replicasets) do + master, err = replicaset:wait_master(timeout) + id = replicaset.id + if not master then + err_id = id + goto fail + end + replicaset:connect_replica(master) + timeout = deadline - fiber_clock() + end + + -- Wait until all connections are established. + for _, replicaset in pairs(replicasets) do + timeout, err = replicaset:wait_connected(timeout) + if not timeout then + err_id = replicaset.id + goto fail + end + end + do return timeout end + + ::fail:: + return nil, err, err_id +end + -- -- Perform map-reduce stages on the given set of replicasets. The map expects a -- valid ref ID which must be done beforehand. @@ -956,14 +987,17 @@ local function router_map_callrw(router, func, args, opts) -- -- Ref stage: send. 
-- + -- Netbox async requests work only with active connections. Need to wait + -- for the connection explicitly. + local rs_list = table_new(0, #replicasets) + for _, rs in pairs(replicasets) do + table.insert(rs_list, rs) + end + timeout, err, err_id = wait_connected_to_masters(rs_list, timeout) + if not timeout then + goto fail + end for id, rs in pairs(replicasets) do - -- Netbox async requests work only with active connections. Need to wait - -- for the connection explicitly. - timeout, err = rs:wait_connected(timeout) - if timeout == nil then - err_id = id - goto fail - end res, err = rs:callrw('vshard.storage._call', {'storage_ref', rid, timeout}, opts_async) if res == nil then @@ -1060,6 +1094,7 @@ local function router_map_part_callrw(router, bucket_ids, func, args, opts) local grouped_buckets local err, err_id, res, ok, map local call_args + local rs_list local replicasets = {} local preallocated = false local futures = {} @@ -1082,6 +1117,18 @@ local function router_map_part_callrw(router, bucket_ids, func, args, opts) end timeout = deadline - fiber_clock() + -- Netbox async requests work only with active connections. + -- So, first need to wait for the master connection explicitly. + rs_list = table_new(0, #grouped_buckets) + for uuid, _ in pairs(grouped_buckets) do + replicaset = router.replicasets[uuid] + table.insert(rs_list, replicaset) + end + timeout, err, err_id = wait_connected_to_masters(rs_list, timeout) + if not timeout then + goto fail + end + -- Send ref requests with timeouts to the replicasets. futures = table_new(0, #grouped_buckets) for id, buckets in pairs(grouped_buckets) do From 7ff139a0c9685bd0de9f483c6b6664900531ae45 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Wed, 18 Sep 2024 20:05:42 +0200 Subject: [PATCH 10/17] Review fixes for Partial Map-Reduce There were a number of minor issues with the previous several commits, like the tests running way too long or some cases not being covered or the code being non-critically suboptimal. Lets fix them all. The original commits aren't amended so as to keep their original shape in respect to the external contributor. 
NO_DOC=bugfix --- test/router-luatest/map_part_test.lua | 489 ++++++++++++++---------- test/router-luatest/router_test.lua | 17 +- test/router/router.result | 1 + test/storage-luatest/storage_1_test.lua | 115 +++--- test/upgrade/upgrade.result | 4 +- vshard/replicaset.lua | 30 +- vshard/router/init.lua | 193 +++------- vshard/storage/init.lua | 34 +- 8 files changed, 429 insertions(+), 454 deletions(-) diff --git a/test/router-luatest/map_part_test.lua b/test/router-luatest/map_part_test.lua index 8a9dc3d3..1a606b2c 100644 --- a/test/router-luatest/map_part_test.lua +++ b/test/router-luatest/map_part_test.lua @@ -1,6 +1,7 @@ - +local fiber = require('fiber') local t = require('luatest') local vtest = require('test.luatest_helpers.vtest') +local vutil = require('vshard.util') local g = t.group('router') local cfg_template = { @@ -22,279 +23,369 @@ local cfg_template = { }, }, }, - bucket_count = 100, + bucket_count = 10, test_user_grant_range = 'super', } local global_cfg = vtest.config_new(cfg_template) -g.before_all(function() - vtest.cluster_new(g, global_cfg) - - t.assert_equals(g.replica_1_a:exec(function() +g.before_all(function(cg) + vtest.cluster_new(cg, global_cfg) + t.assert_equals(cg.replica_1_a:exec(function() return #ivshard.storage.info().alerts end), 0, 'no alerts after boot') - - local router = vtest.router_new(g, 'router', global_cfg) - g.router = router + local _ + local router = vtest.router_new(cg, 'router', global_cfg) + cg.router = router local res, err = router:exec(function() - return ivshard.router.bootstrap({timeout = iwait_timeout}) + local res, err = ivshard.router.bootstrap({timeout = iwait_timeout}) + rawset(_G, 'do_map', function(bids, args, opts) + local old_bids = table.copy(bids) + local val, err, err_uuid = ivshard.router.map_part_callrw( + bids, 'do_map', args, opts) + -- Make sure the bucket list isn't changed by vshard. + ilt.assert_equals(bids, old_bids) + local val_type + if opts.return_raw and val ~= nil then + -- Src+value. The src is plain Lua data. The value is raw. 
+ local _, one_map = next(val) + val_type = type(one_map) + else + val_type = type(val) + end + return { + val = val, + val_type = val_type, + err = err, + err_uuid = err_uuid, + } + end) + return res, err end) t.assert(res and not err, 'bootstrap buckets') + _, err = vtest.cluster_exec_each(cg, function() + rawset(_G, 'do_map', function(res) + ilt.assert_gt(require('vshard.storage.ref').count, 0) + return {ivutil.replicaset_uuid(), res} + end) + rawset(_G, 'bucket_send', function(bid, dst) + local _, err = ivshard.storage.bucket_send( + bid, dst, {timeout = iwait_timeout}) + ilt.assert_equals(err, nil) + end) + end) + t.assert_equals(err, nil) + cg.rs1_uuid = cg.replica_1_a:replicaset_uuid() + cg.rs2_uuid = cg.replica_2_a:replicaset_uuid() end) -g.after_all(function() - g.cluster:drop() +g.after_all(function(cg) + cg.cluster:drop() end) -local function map_part_init() - local rs1_uuid = g.replica_1_a:replicaset_uuid() - local rs2_uuid = g.replica_2_a:replicaset_uuid() - - local create_map_func_f = function(res1) - rawset(_G, 'do_map', function(res2) - return {res1, res2} - end) - end - g.replica_1_a:exec(create_map_func_f, {1}) - g.replica_2_a:exec(create_map_func_f, {2}) - - local bids1 = vtest.storage_get_n_buckets(g.replica_1_a, 4) - local bids2 = vtest.storage_get_n_buckets(g.replica_2_a, 1) - - return { - rs1_uuid = rs1_uuid, - rs2_uuid = rs2_uuid, - bids1 = bids1, - bids2 = bids2, - } +local function router_do_map(router, bids, args, opts) + return router:exec(function(bids, args, opts) + return _G.do_map(bids, args, opts) + end, {bids, args, opts}) end -g.test_map_part_single_rs = function(g) - local expected, res - local init = map_part_init() - - res = g.router:exec(function(bid1, bid2) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid1, bid2}, - 'do_map', - {3}, - {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[3], init.bids1[2]}) +g.test_map_part_single_rs = function(cg) + local bids = vtest.storage_get_n_buckets(cg.replica_1_a, 4) + local res = router_do_map(cg.router, {bids[3], bids[2]}, + {123}, {timeout = vtest.wait_timeout}) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) - expected = { - [init.rs1_uuid] = {{1, 3}}, - } - t.assert_equals(res.val, expected) + t.assert_equals(res.val, { + [cg.rs1_uuid] = {{cg.rs1_uuid, 123}}, + }) end -g.test_map_part_multi_rs = function(g) - local expected, res - local init = map_part_init() - - res = g.router:exec(function(bid1, bid2) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[1], init.bids2[1]}) +g.test_map_part_multi_rs = function(cg) + local bid1 = vtest.storage_first_bucket(cg.replica_1_a) + local bid2 = vtest.storage_first_bucket(cg.replica_2_a) + local res = router_do_map(cg.router, {bid1, bid2}, + {123}, {timeout = vtest.wait_timeout}) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) - expected = { - [init.rs1_uuid] = {{1, 42}}, - [init.rs2_uuid] = {{2, 42}}, - } - t.assert_equals(res.val, expected) + t.assert_equals(res.val, { + [cg.rs1_uuid] = {{cg.rs1_uuid, 123}}, + [cg.rs2_uuid] = {{cg.rs2_uuid, 123}}, + }) end -g.test_map_part_ref = function(g) - local expected, res - local init = map_part_init() - +g.test_map_part_ref = function(cg) -- First move some buckets from rs1 to rs2 and then pause gc on rs1. 
-- As a result, the buckets will be in the SENT state on rs1 and -- in the ACTIVE state on rs2. - g.replica_1_a:exec(function(bid1, bid2, to) - ivshard.storage.internal.errinj.ERRINJ_BUCKET_GC_PAUSE = true - ivshard.storage.bucket_send(bid1, to) - ivshard.storage.bucket_send(bid2, to) - end, {init.bids1[1], init.bids1[2], init.rs2_uuid}) + local bids1 = vtest.storage_get_n_buckets(cg.replica_1_a, 3) + cg.replica_1_a:exec(function(bid1, bid2, to) + _G.bucket_gc_pause() + _G.bucket_send(bid1, to) + _G.bucket_send(bid2, to) + end, {bids1[1], bids1[2], cg.rs2_uuid}) -- The buckets are ACTIVE on rs2, so the partial map should succeed. - res = g.router:exec(function(bid1, bid2) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[1], init.bids1[2]}) + local res = router_do_map(cg.router, {bids1[1], bids1[2]}, + {42}, {timeout = vtest.wait_timeout}) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) - expected = { - [init.rs2_uuid] = {{2, 42}}, - } - t.assert_equals(res.val, expected) + t.assert_equals(res.val, { + [cg.rs2_uuid] = {{cg.rs2_uuid, 42}}, + }) -- But if we use some active bucket from rs1, the partial map should fail. -- The reason is that the moved buckets are still in the SENT state and -- we can't take a ref. - res = g.router:exec(function(bid1) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid1}, 'do_map', {42}, {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[3]}) + res = router_do_map(cg.router, {bids1[3]}, {42}, {timeout = 0.1}) t.assert_equals(res.val, nil) t.assert(res.err) - t.assert_equals(res.err_uuid, init.rs1_uuid) + t.assert_equals(res.err_uuid, cg.rs1_uuid) -- The moved buckets still exist on the rs1 with non-active status. -- Let's remove them and re-enable gc on rs1. - g.replica_1_a:exec(function() - ivshard.storage.internal.errinj.ERRINJ_BUCKET_GC_PAUSE = false + cg.replica_1_a:exec(function() + _G.bucket_gc_continue() _G.bucket_gc_wait() end) -- Now move the buckets back to rs1 and pause gc on rs2. -- The buckets will be ACTIVE on rs1 and SENT on rs2, -- so the partial map should succeed. - res = g.replica_2_a:exec(function(bid1, bid2, to) - ivshard.storage.internal.errinj.ERRINJ_BUCKET_GC_PAUSE = true - ivshard.storage.bucket_send(bid1, to) - local res, err = ivshard.storage.bucket_send(bid2, to) - return { - res = res, - err = err, - } - end, {init.bids1[1], init.bids1[2], init.rs1_uuid}) - t.assert_equals(res.err, nil) + cg.replica_2_a:exec(function(bid1, bid2, to) + _G.bucket_gc_pause() + _G.bucket_send(bid1, to) + _G.bucket_send(bid2, to) + end, {bids1[1], bids1[2], cg.rs1_uuid}) - res = g.router:exec(function(bid1, bid2) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[1], init.bids1[2]}) + res = router_do_map(cg.router, {bids1[1], bids1[2]}, + {42}, {timeout = vtest.wait_timeout}) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) - expected = { - [init.rs1_uuid] = {{1, 42}}, - } - t.assert_equals(res.val, expected) + t.assert_equals(res.val, { + [cg.rs1_uuid] = {{cg.rs1_uuid, 42}}, + }) -- Re-enable gc on rs2. 
- g.replica_2_a:exec(function() - ivshard.storage.internal.errinj.ERRINJ_BUCKET_GC_PAUSE = false + cg.replica_2_a:exec(function() + _G.bucket_gc_continue() _G.bucket_gc_wait() end) end -g.test_map_part_double_ref = function(g) - local expected, res - local init = map_part_init() - +g.test_map_part_double_ref = function(cg) + local bid1 = vtest.storage_first_bucket(cg.replica_1_a) + local bid2 = vtest.storage_first_bucket(cg.replica_2_a) -- First, disable discovery on the router to disable route cache update. - g.router:exec(function() + cg.router:exec(function(bid, uuid) ivshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY = true - end) + -- Make sure the location of the bucket is known. + local rs, err = ivshard.router.bucket_discovery(bid) + ilt.assert_equals(err, nil) + ilt.assert_equals(rs.uuid, uuid) + end, {bid1, cg.rs1_uuid}) -- Then, move the bucket form rs1 to rs2. Now the router has an outdated -- route cache. - g.replica_1_a:exec(function(bid, to) - ivshard.storage.bucket_send(bid, to) - end, {init.bids1[4], init.rs2_uuid}) - -- Call a partial map for the moved bucket and some bucket from rs2. The ref - -- stage should be done in two steps: + cg.replica_1_a:exec(function(bid, to) + _G.bucket_send(bid, to) + _G.bucket_gc_wait() + end, {bid1, cg.rs2_uuid}) + -- Call a partial map for the moved bucket and some bucket + -- from rs2. The ref stage should be done in two steps: -- 1. ref rs2 and returns the moved bucket; -- 2. discover the moved bucket on rs2 and avoid double reference; - res = g.router:exec(function(bid1, bid2) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[4], init.bids2[1]}) + local res = router_do_map(cg.router, {bid1, bid2}, + {42}, {timeout = vtest.wait_timeout}) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) - expected = { - [init.rs2_uuid] = {{2, 42}}, - } - t.assert_equals(res.val, expected) - -- Call a partial map one more time to make sure there are no references - -- left. - res = g.router:exec(function(bid1, bid2) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[4], init.bids2[1]}) - t.assert_equals(res.err, nil) - t.assert_equals(res.err_uuid, nil) - t.assert_equals(res.val, expected) + t.assert_equals(res.val, { + [cg.rs2_uuid] = {{cg.rs2_uuid, 42}}, + }) + -- Make sure there are no references left. + local _, err = vtest.cluster_exec_each(cg, function() + ilt.assert_equals(require('vshard.storage.ref').count, 0) + end) + t.assert_equals(err, nil) -- Return the bucket back and re-enable discovery on the router. 
- g.replica_2_a:exec(function(bid, to) - ivshard.storage.bucket_send(bid, to) - end, {init.bids1[4], init.rs1_uuid}) - g.router:exec(function() + cg.replica_2_a:exec(function(bid, to) + _G.bucket_send(bid, to) + _G.bucket_gc_wait() + end, {bid1, cg.rs1_uuid}) + cg.router:exec(function() ivshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY = false end) end -g.test_map_part_map = function(g) - local res - local init = map_part_init() +g.test_map_part_ref_timeout = function(cg) + local bids = vtest.storage_get_n_buckets(cg.replica_1_a, 2) + local bid1 = bids[1] + local bid2 = bids[2] + + bids = vtest.storage_get_n_buckets(cg.replica_2_a, 2) + local bid3 = bids[1] + local bid4 = bids[2] + + -- First, disable discovery on the router to disable route cache update. + cg.router:exec(function(bids) + ivshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY = true + -- Make sure the location of the bucket is known. + for _, bid in pairs(bids) do + local _, err = ivshard.router.bucket_discovery(bid) + ilt.assert_equals(err, nil) + end + end, {{bid1, bid2, bid3, bid4}}) + + -- Send bucket so the router thinks: + -- rs1: {b1, b2}, rs2: {b3, b4} + -- and actually the state is: + -- rs1: {b1}, rs2: {b2, b3, b4} + cg.replica_1_a:exec(function(bid, to) + _G.bucket_send(bid, to) + _G.bucket_gc_wait() + end, {bid2, cg.rs2_uuid}) + + -- Partial map goes with the outdated mapping to the storages, successfully + -- refs rs1. Then gets a bit stuck in rs2. Rs1 ref in the meantime time is + -- lost. Due to restart or timeout or whatever. + cg.replica_2_a:exec(function() + local lref = require('vshard.storage.ref') + rawset(_G, 'old_ref_add', lref.add) + lref.add = function(rid, sid, ...) + ilt.assert_equals(rawget(_G, 'test_ref'), nil) + rawset(_G, 'test_ref', {rid = rid, sid = sid}) + local ok, err = _G.old_ref_add(rid, sid, ...) + ilt.helpers.retrying({timeout = iwait_timeout}, function() + if rawget(_G, 'test_ref') then + error('Test refs is not picked up') + end + end) + return ok, err + end + end) + local f = fiber.new(function() + return router_do_map(cg.router, {bid1, bid2, bid3, bid4}, + {42}, {timeout = vtest.wait_timeout}) + end) + f:set_joinable(true) + cg.replica_2_a:exec(function() + ilt.helpers.retrying({timeout = iwait_timeout}, function() + if not rawget(_G, 'test_ref') then + error('Test refs is not set') + end + end) + local lref = require('vshard.storage.ref') + local _, err = lref.del(_G.test_ref.rid, _G.test_ref.sid) + ilt.assert_equals(err, nil) + -- Cleanup. + lref.add = _G.old_ref_add + _G.old_ref_add = nil + _G.test_ref = nil + end) + + -- The whole request must fail now. + local ok, res = f:join() + t.assert(ok) + t.assert(res) + t.assert_not_equals(res.err, nil) + t.assert_equals(res.err_uuid, cg.rs2_uuid) + + -- Make sure there are no references left. + local _, err = vtest.cluster_exec_each(cg, function() + ilt.assert_equals(require('vshard.storage.ref').count, 0) + end) + t.assert_equals(err, nil) - g.replica_2_a:exec(function() + -- Return the bucket back and re-enable discovery on the router. 
+ cg.replica_2_a:exec(function(bid, to) + _G.bucket_send(bid, to) + _G.bucket_gc_wait() + end, {bid2, cg.rs1_uuid}) + cg.router:exec(function() + ivshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY = false + end) +end + +g.test_map_part_map = function(cg) + local bid1 = vtest.storage_first_bucket(cg.replica_1_a) + local bid2 = vtest.storage_first_bucket(cg.replica_2_a) + cg.replica_2_a:exec(function() + rawset(_G, 'old_do_map', _G.do_map) _G.do_map = function() return box.error(box.error.PROC_LUA, "map_err") end end) - res = g.router:exec(function(bid1, bid2) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid2, bid1}, 'do_map', {3}, {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[1], init.bids2[1]}) + local res = router_do_map(cg.router, {bid1, bid2}, + {3}, {timeout = vtest.wait_timeout}) t.assert_equals(res.val, nil) t.assert_covers(res.err, { code = box.error.PROC_LUA, type = 'ClientError', message = 'map_err' }) - t.assert_equals(res.err_uuid, init.rs2_uuid) + t.assert_equals(res.err_uuid, cg.rs2_uuid) -- Check that there is no dangling references after the error. - init = map_part_init() - res = g.router:exec(function(bid1, bid2) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid1, bid2}, 'do_map', {3}, {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[1], init.bids2[1]}) - t.assert_equals(res.err, nil) + local _, err = vtest.cluster_exec_each(cg, function() + ilt.assert_equals(require('vshard.storage.ref').count, 0) + end) + t.assert_equals(err, nil) + cg.replica_2_a:exec(function() + _G.do_map = _G.old_do_map + _G.old_do_map = nil + end) + res = router_do_map(cg.router, {bid1, bid2}, + {3}, {timeout = vtest.wait_timeout}) + t.assert_equals(res.err, nil, res.err) t.assert_equals(res.err_uuid, nil) t.assert_equals(res.val, { - [init.rs1_uuid] = {{1, 3}}, - [init.rs2_uuid] = {{2, 3}}, + [cg.rs1_uuid] = {{cg.rs1_uuid, 3}}, + [cg.rs2_uuid] = {{cg.rs2_uuid, 3}}, }) end + +g.test_map_part_callrw_raw = function(cg) + t.run_only_if(vutil.feature.netbox_return_raw) + -- + -- Successful map. + -- + local bid1 = vtest.storage_first_bucket(cg.replica_1_a) + local bid2 = vtest.storage_first_bucket(cg.replica_2_a) + local res = router_do_map(cg.router, {bid1, bid2}, {3}, + {timeout = vtest.wait_timeout, return_raw = true}) + t.assert_equals(res.val, { + [cg.rs1_uuid] = {{cg.rs1_uuid, 3}}, + [cg.rs2_uuid] = {{cg.rs2_uuid, 3}}, + }) + t.assert_equals(res.val_type, 'userdata') + t.assert(not res.err) + -- + -- Successful map, but one of the storages returns nothing. + -- + cg.replica_2_a:exec(function() + rawset(_G, 'old_do_map', _G.do_map) + _G.do_map = function() + return + end + end) + res = router_do_map(cg.router, {bid1, bid2}, {}, + {timeout = vtest.wait_timeout, return_raw = true}) + t.assert_equals(res.val, { + [cg.rs1_uuid] = {{cg.rs1_uuid}}, + }) + -- + -- Error at map stage. + -- + cg.replica_2_a:exec(function() + _G.do_map = function() + return box.error(box.error.PROC_LUA, "map_err") + end + end) + res = router_do_map(cg.router, {bid1, bid2}, {}, + {timeout = vtest.wait_timeout, return_raw = true}) + t.assert_equals(res.val, nil) + t.assert_covers(res.err, { + code = box.error.PROC_LUA, + type = 'ClientError', + message = 'map_err' + }, 'error object') + t.assert_equals(res.err_uuid, cg.rs2_uuid, 'error uuid') + -- + -- Cleanup. 
+ -- + cg.replica_2_a:exec(function() + _G.do_map = _G.old_do_map + _G.old_do_map = nil + end) +end diff --git a/test/router-luatest/router_test.lua b/test/router-luatest/router_test.lua index f9cd6920..022f578c 100644 --- a/test/router-luatest/router_test.lua +++ b/test/router-luatest/router_test.lua @@ -288,21 +288,16 @@ end g.test_group_buckets = function(g) local bids = vtest.storage_get_n_buckets(g.replica_1_a, 2) - - local res = g.router:exec(function(bid1, bid2) - local val, err = ivshard.router._buckets_group({bid2, bid1, bid1}) - return { - val = val, - err = err, - } + local val, err = g.router:exec(function(bid1, bid2) + return ivshard.router._buckets_group({bid2, bid1, bid1, bid2}, + ivtest.wait_timeout) end, {bids[1], bids[2]}) - assert(res.err == nil) + assert(err == nil) local rs1_uuid = g.replica_1_a:replicaset_uuid() local expected = { - [rs1_uuid] = {bids[1], bids[2]}, + [rs1_uuid] = {bids[2], bids[1]}, } - table.sort(expected[rs1_uuid]) - t.assert_equals(res.val, expected) + t.assert_equals(val, expected) end g.test_uri_compare_and_reuse = function(g) diff --git a/test/router/router.result b/test/router/router.result index 8f678564..9ad9bce1 100644 --- a/test/router/router.result +++ b/test/router/router.result @@ -1503,6 +1503,7 @@ error_messages - Use replicaset:update_master(...) instead of replicaset.update_master(...) - Use replicaset:wait_connected(...) instead of replicaset.wait_connected(...) - Use replicaset:wait_connected_all(...) instead of replicaset.wait_connected_all(...) + - Use replicaset:wait_master(...) instead of replicaset.wait_master(...) ... _, replica = next(replicaset.replicas) --- diff --git a/test/storage-luatest/storage_1_test.lua b/test/storage-luatest/storage_1_test.lua index 9e760ebe..0e8d8bd4 100644 --- a/test/storage-luatest/storage_1_test.lua +++ b/test/storage-luatest/storage_1_test.lua @@ -226,33 +226,34 @@ test_group.test_bucket_skip_validate = function(g) end) end -test_group.test_ref_with_lookup = function(g) +test_group.test_ref_with_buckets = function(g) g.replica_1_a:exec(function() + local lref = require('vshard.storage.ref') local res, err, _ - local timeout = 0.1 local rid = 42 local bids = _G.get_n_buckets(2) - local bid_extra = 3001 + local bucket_count = ivshard.storage.internal.total_bucket_count - -- Check for a single bucket. + -- No buckets. res, err = ivshard.storage._call( - 'storage_ref_with_lookup', - rid, - timeout, - {bids[1]} - ) + 'storage_ref_with_buckets', rid, iwait_timeout, {}) + ilt.assert_equals(err, nil) + ilt.assert_equals(res, {moved = {}}) + ilt.assert_equals(lref.count, 0) + + -- Check for a single ok bucket. + res, err = ivshard.storage._call( + 'storage_ref_with_buckets', rid, iwait_timeout, {bids[1]}) ilt.assert_equals(err, nil) ilt.assert_equals(res, {rid = rid, moved = {}}) + ilt.assert_equals(lref.count, 1) _, err = ivshard.storage._call('storage_unref', rid) ilt.assert_equals(err, nil) + ilt.assert_equals(lref.count, 0) - -- Check for multiple buckets. + -- Check for multiple ok buckets. res, err = ivshard.storage._call( - 'storage_ref_with_lookup', - rid, - timeout, - {bids[1], bids[2]} - ) + 'storage_ref_with_buckets', rid, iwait_timeout, {bids[1], bids[2]}) ilt.assert_equals(err, nil) ilt.assert_equals(res, {rid = rid, moved = {}}) _, err = ivshard.storage._call('storage_unref', rid) @@ -260,83 +261,69 @@ test_group.test_ref_with_lookup = function(g) -- Check for double referencing. 
res, err = ivshard.storage._call( - 'storage_ref_with_lookup', - rid, - timeout, - {bids[1], bids[1]} - ) + 'storage_ref_with_buckets', rid, iwait_timeout, {bids[1], bids[1]}) ilt.assert_equals(err, nil) ilt.assert_equals(res, {rid = rid, moved = {}}) + ilt.assert_equals(lref.count, 1) _, err = ivshard.storage._call('storage_unref', rid) ilt.assert_equals(err, nil) - _, err = ivshard.storage._call('storage_unref', rid) - t.assert_str_contains(err.message, - 'Can not delete a storage ref: no ref') + ilt.assert_equals(lref.count, 0) - -- Check for an absent bucket. + -- Bucket mix. res, err = ivshard.storage._call( - 'storage_ref_with_lookup', - rid, - timeout, - {bids[1], bids[2], bid_extra} - ) + 'storage_ref_with_buckets', rid, iwait_timeout, + {bucket_count + 1, bids[1], bucket_count + 2, bids[2], + bucket_count + 3}) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {rid = rid, moved = {bid_extra}}) + ilt.assert_equals(res, { + rid = rid, + moved = {bucket_count + 1, bucket_count + 2, bucket_count + 3} + }) _, err = ivshard.storage._call('storage_unref', rid) ilt.assert_equals(err, nil) - -- Check that we do not create a reference if there are no buckets. + -- No ref when all buckets are missing. res, err = ivshard.storage._call( - 'storage_ref_with_lookup', + 'storage_ref_with_buckets', rid, - timeout, - {bid_extra} + iwait_timeout, + {bucket_count + 1, bucket_count + 2} ) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {rid = nil, moved = {bid_extra}}) - res, err = ivshard.storage._call('storage_unref', rid) - t.assert_str_contains(err.message, - 'Can not delete a storage ref: no ref') - ilt.assert_equals(res, nil) - - -- Check for a timeout. - -- Emulate a case when all buckets are not writable. - local func = ivshard.storage.internal.bucket_are_all_rw - ivshard.storage.internal.bucket_are_all_rw = function() return false end + ilt.assert_equals(res, {moved = {bucket_count + 1, bucket_count + 2}}) + ilt.assert_equals(lref.count, 0) + + -- Timeout when some buckets aren't writable. Doesn't have to be the + -- same buckets as for moving. + _G.bucket_recovery_pause() + box.space._bucket:update( + {bids[1]}, {{'=', 2, ivconst.BUCKET.SENDING}}) res, err = ivshard.storage._call( - 'storage_ref_with_lookup', - rid, - timeout, - {bids[1]} - ) - ivshard.storage.internal.bucket_are_all_rw = func + 'storage_ref_with_buckets', rid, 0.01, {bids[2]}) + box.space._bucket:update( + {bids[1]}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) + _G.bucket_recovery_continue() t.assert_str_contains(err.message, 'Timeout exceeded') ilt.assert_equals(res, nil) - -- Check that the reference was not created. 
- res, err = ivshard.storage._call('storage_unref', rid) - t.assert_str_contains(err.message, - 'Can not delete a storage ref: no ref') - ilt.assert_equals(res, nil) + ilt.assert_equals(lref.count, 0) end) end -test_group.test_absent_buckets = function(g) +test_group.test_moved_buckets = function(g) g.replica_1_a:exec(function() local res, err = ivshard.storage._call( - 'storage_absent_buckets', + 'moved_buckets', {_G.get_first_bucket()} ) ilt.assert_equals(err, nil) ilt.assert_equals(res, {moved = {}}) - end) - g.replica_1_a:exec(function() - local bid_extra = 3001 - local res, err = ivshard.storage._call( - 'storage_absent_buckets', - {_G.get_first_bucket(), bid_extra} + local bucket_count = ivshard.storage.internal.total_bucket_count + res, err = ivshard.storage._call( + 'moved_buckets', + {_G.get_first_bucket(), bucket_count + 1, bucket_count + 2} ) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {moved = {bid_extra}}) + ilt.assert_equals(res, {moved = {bucket_count + 1, bucket_count + 2}}) end) end diff --git a/test/upgrade/upgrade.result b/test/upgrade/upgrade.result index c5b99ba0..5e8e71f5 100644 --- a/test/upgrade/upgrade.result +++ b/test/upgrade/upgrade.result @@ -174,13 +174,13 @@ vshard.storage._call('test_api', 1, 2, 3) | - - bucket_recv | - bucket_test_gc | - info + | - moved_buckets | - rebalancer_apply_routes | - rebalancer_request_state | - recovery_bucket_stat - | - storage_absent_buckets | - storage_map | - storage_ref - | - storage_ref_with_lookup + | - storage_ref_with_buckets | - storage_unref | - test_api | - 1 diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua index 086ed4ec..4f93bdee 100644 --- a/vshard/replicaset.lua +++ b/vshard/replicaset.lua @@ -1514,15 +1514,31 @@ local function buildall(sharding_cfg) return new_replicasets end --- --- Wait for masters connection during RECONNECT_TIMEOUT seconds. --- -local function wait_masters_connect(replicasets) - for _, rs in pairs(replicasets) do - if rs.master then - rs.master.conn:wait_connected(consts.RECONNECT_TIMEOUT) +local function wait_masters_connect(replicasets, timeout) + local err, err_id + -- Start connecting to all masters in parallel. + local deadline = fiber_clock() + timeout + for _, replicaset in pairs(replicasets) do + local master + master, err = replicaset:wait_master(timeout) + if not master then + err_id = replicaset.id + goto fail + end + replicaset:connect_replica(master) + timeout = deadline - fiber_clock() + end + -- Wait until all connections are established. + for _, replicaset in pairs(replicasets) do + timeout, err = replicaset:wait_connected(timeout) + if not timeout then + err_id = replicaset.id + goto fail end end + do return timeout end + ::fail:: + return nil, err, err_id end return { diff --git a/vshard/router/init.lua b/vshard/router/init.lua index 48bd9f4e..4fbb7432 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -225,32 +225,24 @@ end -- Group bucket ids by replicasets according to the router cache. local function buckets_group(router, bucket_ids, timeout) - timeout = timeout or consts.CALL_TIMEOUT_MIN local deadline = fiber_clock() + timeout local replicaset, err local replicaset_buckets = {} - local prev_id = nil local buckets, id - - -- Sort buckets to skip duplicates. - table.sort(bucket_ids) + local bucket_map = {} for _, bucket_id in pairs(bucket_ids) do - if bucket_id == prev_id then + if bucket_map[bucket_id] then goto continue end - + -- Avoid duplicates. 
+ bucket_map[bucket_id] = true if fiber_clock() > deadline then return nil, lerror.timeout() end - replicaset, err = bucket_resolve(router, bucket_id) if err ~= nil then return nil, err end - if replicaset == nil then - return nil, lerror.vshard(lerror.code.NO_ROUTE_TO_BUCKET, bucket_id) - end - id = replicaset.id buckets = replicaset_buckets[id] if buckets then @@ -258,9 +250,7 @@ local function buckets_group(router, bucket_ids, timeout) else replicaset_buckets[id] = {bucket_id} end - ::continue:: - prev_id = bucket_id end return replicaset_buckets @@ -804,37 +794,6 @@ local function router_call(router, bucket_id, opts, ...) ...) end -local function wait_connected_to_masters(replicasets, timeout) - local master, id - local err, err_id - - -- Start connecting to all masters in parallel. - local deadline = fiber_clock() + timeout - for _, replicaset in pairs(replicasets) do - master, err = replicaset:wait_master(timeout) - id = replicaset.id - if not master then - err_id = id - goto fail - end - replicaset:connect_replica(master) - timeout = deadline - fiber_clock() - end - - -- Wait until all connections are established. - for _, replicaset in pairs(replicasets) do - timeout, err = replicaset:wait_connected(timeout) - if not timeout then - err_id = replicaset.id - goto fail - end - end - do return timeout end - - ::fail:: - return nil, err, err_id -end - -- -- Perform map-reduce stages on the given set of replicasets. The map expects a -- valid ref ID which must be done beforehand. @@ -989,11 +948,8 @@ local function router_map_callrw(router, func, args, opts) -- -- Netbox async requests work only with active connections. Need to wait -- for the connection explicitly. - local rs_list = table_new(0, #replicasets) - for _, rs in pairs(replicasets) do - table.insert(rs_list, rs) - end - timeout, err, err_id = wait_connected_to_masters(rs_list, timeout) + timeout, err, err_id = lreplicaset.wait_masters_connect( + replicasets, timeout) if not timeout then goto fail end @@ -1090,18 +1046,22 @@ end -- though all replicasets were scanned. -- local function router_map_part_callrw(router, bucket_ids, func, args, opts) - local replicaset local grouped_buckets - local err, err_id, res, ok, map - local call_args - local rs_list - local replicasets = {} - local preallocated = false + local group_count + local err, err_id, res, map + local replicasets_to_map = {} local futures = {} - local call_opts = {is_async = true} - local rs_count = 0 - local timeout = opts and opts.timeout or consts.CALL_TIMEOUT_MIN + local do_return_raw + local timeout + if opts then + timeout = opts.timeout or consts.CALL_TIMEOUT_MIN + do_return_raw = opts.return_raw + else + timeout = consts.CALL_TIMEOUT_MIN + end + local opts_async = {is_async = true} local deadline = fiber_clock() + timeout + local replicasets_all = router.replicasets local rid = M.ref_id M.ref_id = rid + 1 @@ -1109,7 +1069,7 @@ local function router_map_part_callrw(router, bucket_ids, func, args, opts) -- netbox requests return box.NULL instead of nils. -- Ref stage. - while #bucket_ids > 0 do + while next(bucket_ids) do -- Group the buckets by replicasets according to the router cache. grouped_buckets, err = buckets_group(router, bucket_ids, timeout) if grouped_buckets == nil then @@ -1119,37 +1079,35 @@ local function router_map_part_callrw(router, bucket_ids, func, args, opts) -- Netbox async requests work only with active connections. -- So, first need to wait for the master connection explicitly. 
- rs_list = table_new(0, #grouped_buckets) + local replicasets_to_check = {} + group_count = 0 for uuid, _ in pairs(grouped_buckets) do - replicaset = router.replicasets[uuid] - table.insert(rs_list, replicaset) + group_count = group_count + 1 + table.insert(replicasets_to_check, replicasets_all[uuid]) end - timeout, err, err_id = wait_connected_to_masters(rs_list, timeout) + timeout, err, err_id = lreplicaset.wait_masters_connect( + replicasets_to_check, timeout) if not timeout then goto fail end -- Send ref requests with timeouts to the replicasets. - futures = table_new(0, #grouped_buckets) + futures = table_new(0, group_count) for id, buckets in pairs(grouped_buckets) do - replicaset = router.replicasets[id] - -- Netbox async requests work only with active connections. Need to - -- wait for the connection explicitly. - timeout, err = replicaset:wait_connected(timeout) if timeout == nil then err_id = id goto fail end - if replicasets[id] then - -- Replicaset is already referred on the previous iteration. + local args_ref + if replicasets_to_map[id] then + -- Replicaset is already referenced on a previous iteration. -- Simply get the moved buckets without double referencing. - call_args = {'storage_absent_buckets', buckets} + args_ref = {'moved_buckets', buckets} else - call_args = {'storage_ref_with_lookup', rid, timeout, buckets} - rs_count = rs_count + 1 + args_ref = {'storage_ref_with_buckets', rid, timeout, buckets} end - res, err = replicaset:callrw('vshard.storage._call', call_args, - call_opts) + res, err = replicasets_all[id]:callrw('vshard.storage._call', + args_ref, opts_async) if res == nil then err_id = id goto fail @@ -1157,17 +1115,10 @@ local function router_map_part_callrw(router, bucket_ids, func, args, opts) futures[id] = res end - if not preallocated then - -- Current preallocation works only for the first iteration - -- (i.e. router cache is not outdated). - replicasets = table_new(0, rs_count) - preallocated = true - end - -- Wait for the refs to be done and collect moved buckets. bucket_ids = {} - for id, future in pairs(futures) do - res, err = future_wait(future, timeout) + for id, f in pairs(futures) do + res, err = future_wait(f, timeout) -- Handle netbox error first. if res == nil then err_id = id @@ -1184,71 +1135,23 @@ local function router_map_part_callrw(router, bucket_ids, func, args, opts) table.insert(bucket_ids, bucket_id) end if res.rid then - -- If there are no buckets on the replicaset, - -- it would not be referred. - replicasets[id] = router.replicasets[id] + assert(not replicasets_to_map[id]) + -- If there are no buckets on the replicaset, it would not be + -- referenced. + replicasets_to_map[id] = replicasets_all[id] end timeout = deadline - fiber_clock() end end - - -- Map stage. - map = table_new(0, rs_count) - futures = table_new(0, rs_count) - args = {'storage_map', rid, func, args} - -- Send map requests. - for id, rs in pairs(replicasets) do - res, err = rs:callrw('vshard.storage._call', args, call_opts) - if res == nil then - err_id = id - goto fail - end - futures[id] = res - end - - -- Reduce stage. - -- Collect map responses (refs were deleted by the storages for non-error - -- results). - for id, future in pairs(futures) do - res, err = future_wait(future, timeout) - if res == nil then - err_id = id - goto fail - end - -- Map returns true,res or nil,err. 
- ok, res = res[1], res[2] - if ok == nil then - err = res - err_id = id - goto fail - end - if res ~= nil then - -- Store as a table so in future it could be extended for - -- multireturn. - map[id] = {res} - end - timeout = deadline - fiber_clock() + map, err, err_id = replicasets_map_reduce( + replicasets_to_map, rid, func, args, { + timeout = timeout, return_raw = do_return_raw + }) + if map then + return map end - do return map end - ::fail:: - local f - for id, future in pairs(futures) do - future:discard() - -- Best effort to remove the created refs before exiting. Can help if - -- the timeout was big and the error happened early. - call_args = {'storage_unref', rid} - replicaset = router.replicasets[id] - if replicaset then - f = replicaset:callrw('vshard.storage._call', call_args, call_opts) - if f ~= nil then - -- Don't care waiting for a result - no time for this. But it - -- won't affect the request sending if the connection is still - -- alive. - f:discard() - end - end - end + replicasets_map_cancel_refs(replicasets_all, futures, rid) err = lerror.make(err) return nil, err, err_id end @@ -1652,7 +1555,7 @@ local function router_cfg(router, cfg, is_reload) for _, replicaset in pairs(new_replicasets) do replicaset:connect_all() end - lreplicaset.wait_masters_connect(new_replicasets) + lreplicaset.wait_masters_connect(new_replicasets, consts.RECONNECT_TIMEOUT) lreplicaset.outdate_replicasets(router.replicasets, vshard_cfg.connection_outdate_delay) router.connection_outdate_delay = vshard_cfg.connection_outdate_delay diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua index 2fbfba27..a8fa3a07 100644 --- a/vshard/storage/init.lua +++ b/vshard/storage/init.lua @@ -3187,7 +3187,7 @@ end -- @return A dictionary with a list of bucket identifiers which are -- not present on the current instance. -- -local function storage_absent_buckets(bucket_ids) +local function storage_moved_buckets(bucket_ids) local status = consts.BUCKET local moved_buckets = {} for _, bucket_id in pairs(bucket_ids) do @@ -3212,31 +3212,13 @@ end -- are absent, the reference is not created and a nil reference id -- with the list of absent buckets is returned. -- -local function storage_ref_with_lookup(rid, timeout, bucket_ids) - local moved = storage_absent_buckets(bucket_ids).moved +local function storage_ref_with_buckets(rid, timeout, bucket_ids) + local moved = storage_moved_buckets(bucket_ids).moved if #moved == #bucket_ids then - -- Take an advantage that moved buckets are returned in the same - -- order as in the input list. - local do_match = true - local next_moved = next(moved) - local next_passed = next(bucket_ids) - ::continue:: - if next_moved then - if next_moved == next_passed then - next_moved = next(moved, next_moved) - next_passed = next(bucket_ids, next_passed) - goto continue - else - do_match = false - end - end - if do_match then - -- If all the passed buckets are absent, there is no need - -- to create a ref. - return {rid = nil, moved = moved} - end + -- If all the passed buckets are absent, there is no need to create a + -- ref. 
+ return {rid = nil, moved = moved} end - local ok, err = storage_ref(rid, timeout) if not ok then return nil, err @@ -3298,9 +3280,9 @@ service_call_api = setmetatable({ rebalancer_apply_routes = rebalancer_apply_routes, rebalancer_request_state = rebalancer_request_state, recovery_bucket_stat = recovery_bucket_stat, - storage_absent_buckets = storage_absent_buckets, + moved_buckets = storage_moved_buckets, storage_ref = storage_ref, - storage_ref_with_lookup = storage_ref_with_lookup, + storage_ref_with_buckets = storage_ref_with_buckets, storage_unref = storage_unref, storage_map = storage_map, info = storage_service_info, From b2c3c6e140040d45753b5ab4b6e55b4afdd29df3 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Thu, 3 Oct 2024 12:11:26 +0200 Subject: [PATCH 11/17] router: move Ref stage of Map-Reduce into new func There are 2 Ref-Map-Reduce functions right now - map_callrw() and map_part_callrw(). Their only difference is that the former refs the whole cluster, while the latter refs only a subset of storages. The rest is the same. There is an idea, that better lets merge these functions into one and make the bucket IDs an option. The commit extracts the Ref stages of both functions into separate helpers which will allow to keep this future single function very short and simple. NO_DOC=internal NO_TEST=refactoring --- vshard/router/init.lua | 362 ++++++++++++++++++++++------------------- 1 file changed, 195 insertions(+), 167 deletions(-) diff --git a/vshard/router/init.lua b/vshard/router/init.lua index 4fbb7432..d37890d4 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -794,6 +794,178 @@ local function router_call(router, bucket_id, opts, ...) ...) end +-- +-- Perform Ref stage of the Ref-Map-Reduce process on all the known replicasets. +-- +local function router_ref_storage_all(router, timeout) + local replicasets = router.replicasets + local deadline = fiber_clock() + timeout + local err, err_id, res + local futures = {} + local bucket_count = 0 + local opts_async = {is_async = true} + local rs_count = 0 + local rid = M.ref_id + M.ref_id = rid + 1 + -- Nil checks are done explicitly here (== nil instead of 'not'), because + -- netbox requests return box.NULL instead of nils. + + -- + -- Ref stage: send. + -- + -- Netbox async requests work only with active connections. Need to wait + -- for the connection explicitly. + timeout, err, err_id = lreplicaset.wait_masters_connect( + replicasets, timeout) + if not timeout then + goto fail + end + for id, rs in pairs(replicasets) do + res, err = rs:callrw('vshard.storage._call', + {'storage_ref', rid, timeout}, opts_async) + if res == nil then + err_id = id + goto fail + end + futures[id] = res + rs_count = rs_count + 1 + end + -- + -- Ref stage: collect. + -- + for id, future in pairs(futures) do + res, err = future_wait(future, timeout) + -- Handle netbox error first. + if res == nil then + err_id = id + goto fail + end + -- Ref returns nil,err or bucket count. + res, err = res[1], res[2] + if res == nil then + err_id = id + goto fail + end + bucket_count = bucket_count + res + timeout = deadline - fiber_clock() + end + -- All refs are done but not all buckets are covered. This is odd and can + -- mean many things. The most possible ones: 1) outdated configuration on + -- the router and it does not see another replicaset with more buckets, + -- 2) some buckets are simply lost or duplicated - could happen as a bug, or + -- if the user does a maintenance of some kind by creating/deleting buckets. 
+ -- In both cases can't guarantee all the data would be covered by Map calls. + if bucket_count ~= router.total_bucket_count then + err = lerror.vshard(lerror.code.UNKNOWN_BUCKETS, + router.total_bucket_count - bucket_count) + goto fail + end + do return timeout, nil, nil, rid, futures, replicasets end + + ::fail:: + return nil, err, err_id, rid, futures +end + +-- +-- Perform Ref stage of the Ref-Map-Reduce process on a subset of all the +-- replicasets, which contains all the listed bucket IDs. +-- +local function router_ref_storage_by_buckets(router, bucket_ids, timeout) + local grouped_buckets + local group_count + local err, err_id, res + local replicasets_all = router.replicasets + local replicasets_to_map = {} + local futures = {} + local opts_async = {is_async = true} + local deadline = fiber_clock() + timeout + local rid = M.ref_id + M.ref_id = rid + 1 + + -- Nil checks are done explicitly here (== nil instead of 'not'), because + -- netbox requests return box.NULL instead of nils. + + -- Ref stage. + while next(bucket_ids) do + -- Group the buckets by replicasets according to the router cache. + grouped_buckets, err = buckets_group(router, bucket_ids, timeout) + if grouped_buckets == nil then + goto fail + end + timeout = deadline - fiber_clock() + + -- Netbox async requests work only with active connections. + -- So, first need to wait for the master connection explicitly. + local replicasets_to_check = {} + group_count = 0 + for uuid, _ in pairs(grouped_buckets) do + group_count = group_count + 1 + table.insert(replicasets_to_check, replicasets_all[uuid]) + end + timeout, err, err_id = lreplicaset.wait_masters_connect( + replicasets_to_check, timeout) + if not timeout then + goto fail + end + + -- Send ref requests with timeouts to the replicasets. + futures = table_new(0, group_count) + for id, buckets in pairs(grouped_buckets) do + if timeout == nil then + err_id = id + goto fail + end + local args_ref + if replicasets_to_map[id] then + -- Replicaset is already referenced on a previous iteration. + -- Simply get the moved buckets without double referencing. + args_ref = {'moved_buckets', buckets} + else + args_ref = {'storage_ref_with_buckets', rid, timeout, buckets} + end + res, err = replicasets_all[id]:callrw('vshard.storage._call', + args_ref, opts_async) + if res == nil then + err_id = id + goto fail + end + futures[id] = res + end + + -- Wait for the refs to be done and collect moved buckets. + bucket_ids = {} + for id, f in pairs(futures) do + res, err = future_wait(f, timeout) + -- Handle netbox error first. + if res == nil then + err_id = id + goto fail + end + -- Ref returns nil,err or {rid, moved}. + res, err = res[1], res[2] + if res == nil then + err_id = id + goto fail + end + for _, bucket_id in pairs(res.moved) do + bucket_reset(router, bucket_id) + table.insert(bucket_ids, bucket_id) + end + if res.rid then + assert(not replicasets_to_map[id]) + -- If there are no buckets on the replicaset, it would not be + -- referenced. + replicasets_to_map[id] = replicasets_all[id] + end + timeout = deadline - fiber_clock() + end + end + do return timeout, nil, nil, rid, futures, replicasets_to_map end + + ::fail:: + return nil, err, err_id, rid, futures +end + -- -- Perform map-reduce stages on the given set of replicasets. The map expects a -- valid ref ID which must be done beforehand. @@ -923,83 +1095,25 @@ end -- found even though all replicasets were scanned. 
-- local function router_map_callrw(router, func, args, opts) - local replicasets = router.replicasets - local timeout - local do_return_raw + local replicasets, err, err_id, map, futures, rid + local timeout, do_return_raw if opts then timeout = opts.timeout or consts.CALL_TIMEOUT_MIN do_return_raw = opts.return_raw else timeout = consts.CALL_TIMEOUT_MIN end - local deadline = fiber_clock() + timeout - local err, err_id, res, map - local futures = {} - local bucket_count = 0 - local opts_async = {is_async = true} - local rs_count = 0 - local rid = M.ref_id - M.ref_id = rid + 1 - -- Nil checks are done explicitly here (== nil instead of 'not'), because - -- netbox requests return box.NULL instead of nils. - - -- - -- Ref stage: send. - -- - -- Netbox async requests work only with active connections. Need to wait - -- for the connection explicitly. - timeout, err, err_id = lreplicaset.wait_masters_connect( - replicasets, timeout) - if not timeout then - goto fail - end - for id, rs in pairs(replicasets) do - res, err = rs:callrw('vshard.storage._call', - {'storage_ref', rid, timeout}, opts_async) - if res == nil then - err_id = id - goto fail + timeout, err, err_id, rid, futures, replicasets = + router_ref_storage_all(router, timeout) + if timeout then + map, err, err_id = replicasets_map_reduce(replicasets, rid, func, + args, { + timeout = timeout, return_raw = do_return_raw + }) + if map then + return map end - futures[id] = res - rs_count = rs_count + 1 end - -- - -- Ref stage: collect. - -- - for id, future in pairs(futures) do - res, err = future_wait(future, timeout) - -- Handle netbox error first. - if res == nil then - err_id = id - goto fail - end - -- Ref returns nil,err or bucket count. - res, err = res[1], res[2] - if res == nil then - err_id = id - goto fail - end - bucket_count = bucket_count + res - timeout = deadline - fiber_clock() - end - -- All refs are done but not all buckets are covered. This is odd and can - -- mean many things. The most possible ones: 1) outdated configuration on - -- the router and it does not see another replicaset with more buckets, - -- 2) some buckets are simply lost or duplicated - could happen as a bug, or - -- if the user does a maintenance of some kind by creating/deleting buckets. - -- In both cases can't guarantee all the data would be covered by Map calls. - if bucket_count ~= router.total_bucket_count then - err = lerror.vshard(lerror.code.UNKNOWN_BUCKETS, - router.total_bucket_count - bucket_count) - goto fail - end - map, err, err_id = replicasets_map_reduce(replicasets, rid, func, args, { - timeout = timeout, return_raw = do_return_raw - }) - if map then - return map - end -::fail:: replicasets_map_cancel_refs(replicasets, futures, rid) err = lerror.make(err) return nil, err, err_id @@ -1046,112 +1160,26 @@ end -- though all replicasets were scanned. 
-- local function router_map_part_callrw(router, bucket_ids, func, args, opts) - local grouped_buckets - local group_count - local err, err_id, res, map - local replicasets_to_map = {} - local futures = {} - local do_return_raw - local timeout + local timeout, err, err_id, map, futures, do_return_raw, rid + local replicasets_to_map if opts then timeout = opts.timeout or consts.CALL_TIMEOUT_MIN do_return_raw = opts.return_raw else timeout = consts.CALL_TIMEOUT_MIN end - local opts_async = {is_async = true} - local deadline = fiber_clock() + timeout - local replicasets_all = router.replicasets - local rid = M.ref_id - M.ref_id = rid + 1 - - -- Nil checks are done explicitly here (== nil instead of 'not'), because - -- netbox requests return box.NULL instead of nils. - - -- Ref stage. - while next(bucket_ids) do - -- Group the buckets by replicasets according to the router cache. - grouped_buckets, err = buckets_group(router, bucket_ids, timeout) - if grouped_buckets == nil then - goto fail - end - timeout = deadline - fiber_clock() - - -- Netbox async requests work only with active connections. - -- So, first need to wait for the master connection explicitly. - local replicasets_to_check = {} - group_count = 0 - for uuid, _ in pairs(grouped_buckets) do - group_count = group_count + 1 - table.insert(replicasets_to_check, replicasets_all[uuid]) + timeout, err, err_id, rid, futures, replicasets_to_map = + router_ref_storage_by_buckets(router, bucket_ids, timeout) + if timeout then + map, err, err_id = replicasets_map_reduce( + replicasets_to_map, rid, func, args, { + timeout = timeout, return_raw = do_return_raw + }) + if map then + return map end - timeout, err, err_id = lreplicaset.wait_masters_connect( - replicasets_to_check, timeout) - if not timeout then - goto fail - end - - -- Send ref requests with timeouts to the replicasets. - futures = table_new(0, group_count) - for id, buckets in pairs(grouped_buckets) do - if timeout == nil then - err_id = id - goto fail - end - local args_ref - if replicasets_to_map[id] then - -- Replicaset is already referenced on a previous iteration. - -- Simply get the moved buckets without double referencing. - args_ref = {'moved_buckets', buckets} - else - args_ref = {'storage_ref_with_buckets', rid, timeout, buckets} - end - res, err = replicasets_all[id]:callrw('vshard.storage._call', - args_ref, opts_async) - if res == nil then - err_id = id - goto fail - end - futures[id] = res - end - - -- Wait for the refs to be done and collect moved buckets. - bucket_ids = {} - for id, f in pairs(futures) do - res, err = future_wait(f, timeout) - -- Handle netbox error first. - if res == nil then - err_id = id - goto fail - end - -- Ref returns nil,err or {rid, moved}. - res, err = res[1], res[2] - if res == nil then - err_id = id - goto fail - end - for _, bucket_id in pairs(res.moved) do - bucket_reset(router, bucket_id) - table.insert(bucket_ids, bucket_id) - end - if res.rid then - assert(not replicasets_to_map[id]) - -- If there are no buckets on the replicaset, it would not be - -- referenced. 
- replicasets_to_map[id] = replicasets_all[id] - end - timeout = deadline - fiber_clock() - end - end - map, err, err_id = replicasets_map_reduce( - replicasets_to_map, rid, func, args, { - timeout = timeout, return_raw = do_return_raw - }) - if map then - return map end -::fail:: - replicasets_map_cancel_refs(replicasets_all, futures, rid) + replicasets_map_cancel_refs(router.replicasets, futures, rid) err = lerror.make(err) return nil, err, err_id end From c72abb982aea27c78f62455f18eb331babf10bd7 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Thu, 3 Oct 2024 12:18:58 +0200 Subject: [PATCH 12/17] router: merge map_callrw and map_part_callrw The behavior is regulated with the new bucket_ids option. @TarantoolBot document Title: vshard: `bucket_ids` option for `router.map_callrw()` The option is an array of numeric bucket IDs. When specified, the Ref-Map-Reduce is only performed on the masters having at least one of these buckets. By default all the stages are done on all masters in the cluster. Example: ```Lua -- Assume buckets 1, 2, 3 cover replicasets UUID_A and UUID_B. res, err = vshard.router.map_callrw(func, args, {bucket_ids = {1, 2, 3}}) assert(res[UUID_A] == {func_result_from_A}) assert(res[UUID_B] == {func_result_from_B}) ``` --- test/router-luatest/map_part_test.lua | 94 ++++++++++++++-------- vshard/router/init.lua | 108 +++++++------------------- 2 files changed, 91 insertions(+), 111 deletions(-) diff --git a/test/router-luatest/map_part_test.lua b/test/router-luatest/map_part_test.lua index 1a606b2c..cdaccd28 100644 --- a/test/router-luatest/map_part_test.lua +++ b/test/router-luatest/map_part_test.lua @@ -38,12 +38,12 @@ g.before_all(function(cg) cg.router = router local res, err = router:exec(function() local res, err = ivshard.router.bootstrap({timeout = iwait_timeout}) - rawset(_G, 'do_map', function(bids, args, opts) - local old_bids = table.copy(bids) - local val, err, err_uuid = ivshard.router.map_part_callrw( - bids, 'do_map', args, opts) - -- Make sure the bucket list isn't changed by vshard. - ilt.assert_equals(bids, old_bids) + rawset(_G, 'do_map', function(args, opts) + local old_opts = table.copy(opts) + local val, err, err_uuid = ivshard.router.map_callrw( + 'do_map', args, opts) + -- Make sure the options aren't changed by vshard. + ilt.assert_equals(old_opts, opts) local val_type if opts.return_raw and val ~= nil then -- Src+value. The src is plain Lua data. The value is raw. 
@@ -82,16 +82,18 @@ g.after_all(function(cg) cg.cluster:drop() end) -local function router_do_map(router, bids, args, opts) - return router:exec(function(bids, args, opts) - return _G.do_map(bids, args, opts) - end, {bids, args, opts}) +local function router_do_map(router, args, opts) + return router:exec(function(args, opts) + return _G.do_map(args, opts) + end, {args, opts}) end g.test_map_part_single_rs = function(cg) local bids = vtest.storage_get_n_buckets(cg.replica_1_a, 4) - local res = router_do_map(cg.router, {bids[3], bids[2]}, - {123}, {timeout = vtest.wait_timeout}) + local res = router_do_map(cg.router, {123}, { + timeout = vtest.wait_timeout, + bucket_ids = {bids[3], bids[2]}, + }) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) t.assert_equals(res.val, { @@ -102,8 +104,10 @@ end g.test_map_part_multi_rs = function(cg) local bid1 = vtest.storage_first_bucket(cg.replica_1_a) local bid2 = vtest.storage_first_bucket(cg.replica_2_a) - local res = router_do_map(cg.router, {bid1, bid2}, - {123}, {timeout = vtest.wait_timeout}) + local res = router_do_map(cg.router, {123}, { + timeout = vtest.wait_timeout, + bucket_ids = {bid1, bid2}, + }) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) t.assert_equals(res.val, { @@ -123,8 +127,10 @@ g.test_map_part_ref = function(cg) _G.bucket_send(bid2, to) end, {bids1[1], bids1[2], cg.rs2_uuid}) -- The buckets are ACTIVE on rs2, so the partial map should succeed. - local res = router_do_map(cg.router, {bids1[1], bids1[2]}, - {42}, {timeout = vtest.wait_timeout}) + local res = router_do_map(cg.router, {42}, { + timeout = vtest.wait_timeout, + bucket_ids = {bids1[1], bids1[2]}, + }) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) t.assert_equals(res.val, { @@ -133,7 +139,10 @@ g.test_map_part_ref = function(cg) -- But if we use some active bucket from rs1, the partial map should fail. -- The reason is that the moved buckets are still in the SENT state and -- we can't take a ref. - res = router_do_map(cg.router, {bids1[3]}, {42}, {timeout = 0.1}) + res = router_do_map(cg.router, {42}, { + timeout = 0.1, + bucket_ids = {bids1[3]}, + }) t.assert_equals(res.val, nil) t.assert(res.err) t.assert_equals(res.err_uuid, cg.rs1_uuid) @@ -152,8 +161,10 @@ g.test_map_part_ref = function(cg) _G.bucket_send(bid2, to) end, {bids1[1], bids1[2], cg.rs1_uuid}) - res = router_do_map(cg.router, {bids1[1], bids1[2]}, - {42}, {timeout = vtest.wait_timeout}) + res = router_do_map(cg.router, {42}, { + timeout = vtest.wait_timeout, + bucket_ids = {bids1[1], bids1[2]}, + }) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) t.assert_equals(res.val, { @@ -187,8 +198,10 @@ g.test_map_part_double_ref = function(cg) -- from rs2. The ref stage should be done in two steps: -- 1. ref rs2 and returns the moved bucket; -- 2. 
discover the moved bucket on rs2 and avoid double reference; - local res = router_do_map(cg.router, {bid1, bid2}, - {42}, {timeout = vtest.wait_timeout}) + local res = router_do_map(cg.router, {42}, { + timeout = vtest.wait_timeout, + bucket_ids = {bid1, bid2}, + }) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) t.assert_equals(res.val, { @@ -256,8 +269,10 @@ g.test_map_part_ref_timeout = function(cg) end end) local f = fiber.new(function() - return router_do_map(cg.router, {bid1, bid2, bid3, bid4}, - {42}, {timeout = vtest.wait_timeout}) + return router_do_map(cg.router, {42}, { + timeout = vtest.wait_timeout, + bucket_ids = {bid1, bid2, bid3, bid4}, + }) end) f:set_joinable(true) cg.replica_2_a:exec(function() @@ -307,8 +322,10 @@ g.test_map_part_map = function(cg) return box.error(box.error.PROC_LUA, "map_err") end end) - local res = router_do_map(cg.router, {bid1, bid2}, - {3}, {timeout = vtest.wait_timeout}) + local res = router_do_map(cg.router, {3}, { + timeout = vtest.wait_timeout, + bucket_ids = {bid1, bid2}, + }) t.assert_equals(res.val, nil) t.assert_covers(res.err, { code = box.error.PROC_LUA, @@ -325,8 +342,10 @@ g.test_map_part_map = function(cg) _G.do_map = _G.old_do_map _G.old_do_map = nil end) - res = router_do_map(cg.router, {bid1, bid2}, - {3}, {timeout = vtest.wait_timeout}) + res = router_do_map(cg.router, {3}, { + timeout = vtest.wait_timeout, + bucket_ids = {bid1, bid2}, + }) t.assert_equals(res.err, nil, res.err) t.assert_equals(res.err_uuid, nil) t.assert_equals(res.val, { @@ -342,8 +361,11 @@ g.test_map_part_callrw_raw = function(cg) -- local bid1 = vtest.storage_first_bucket(cg.replica_1_a) local bid2 = vtest.storage_first_bucket(cg.replica_2_a) - local res = router_do_map(cg.router, {bid1, bid2}, {3}, - {timeout = vtest.wait_timeout, return_raw = true}) + local res = router_do_map(cg.router, {3}, { + timeout = vtest.wait_timeout, + return_raw = true, + bucket_ids = {bid1, bid2}, + }) t.assert_equals(res.val, { [cg.rs1_uuid] = {{cg.rs1_uuid, 3}}, [cg.rs2_uuid] = {{cg.rs2_uuid, 3}}, @@ -359,8 +381,11 @@ g.test_map_part_callrw_raw = function(cg) return end end) - res = router_do_map(cg.router, {bid1, bid2}, {}, - {timeout = vtest.wait_timeout, return_raw = true}) + res = router_do_map(cg.router, {}, { + timeout = vtest.wait_timeout, + return_raw = true, + bucket_ids = {bid1, bid2}, + }) t.assert_equals(res.val, { [cg.rs1_uuid] = {{cg.rs1_uuid}}, }) @@ -372,8 +397,11 @@ g.test_map_part_callrw_raw = function(cg) return box.error(box.error.PROC_LUA, "map_err") end end) - res = router_do_map(cg.router, {bid1, bid2}, {}, - {timeout = vtest.wait_timeout, return_raw = true}) + res = router_do_map(cg.router, {}, { + timeout = vtest.wait_timeout, + return_raw = true, + bucket_ids = {bid1, bid2}, + }) t.assert_equals(res.val, nil) t.assert_covers(res.err, { code = box.error.PROC_LUA, diff --git a/vshard/router/init.lua b/vshard/router/init.lua index d37890d4..0430b596 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -1056,9 +1056,13 @@ local function replicasets_map_cancel_refs(replicasets, futures, rid) end -- --- Consistent Map-Reduce. The given function is called on all masters in the --- cluster with a guarantee that in case of success it was executed with all --- buckets being accessible for reads and writes. +-- Consistent Map-Reduce. The given function is called on masters in the cluster +-- with a guarantee that in case of success it was executed with all buckets +-- being accessible for reads and writes. 
+-- +-- The selection of masters depends on bucket_ids option. When specified, the +-- Map-Reduce is performed only on masters having at least one of these buckets. +-- Otherwise it is executed on all the masters in the cluster. -- -- Consistency in scope of map-reduce means all the data was accessible, and -- didn't move during map requests execution. To preserve the consistency there @@ -1080,11 +1084,19 @@ end -- @param router Router instance to use. -- @param func Name of the function to call. -- @param args Function arguments passed in netbox style (as an array). --- @param opts Can only contain 'timeout' as a number of seconds. Note that the --- refs may end up being kept on the storages during this entire timeout if --- something goes wrong. For instance, network issues appear. This means --- better not use a value bigger than necessary. A stuck infinite ref can --- only be dropped by this router restart/reconnect or the storage restart. +-- @param opts Options. See below: +-- - timeout - a number of seconds. Note that the refs may end up being kept +-- on the storages during this entire timeout if something goes wrong. +-- For instance, network issues appear. This means better not use a +-- value bigger than necessary. A stuck infinite ref can only be dropped +-- by this router restart/reconnect or the storage restart. +-- - return_raw - true/false. When specified, the returned values are not +-- decoded into Lua native objects and stay packed as a msgpack object +-- (see 'msgpack' module). By default all is decoded. That might be +-- undesirable when the returned values are going to be forwarded back +-- into the network anyway. +-- - bucket_ids - an array of bucket IDs which have to be covered by +-- Map-Reduce. By default the whole cluster is covered. -- -- @return In case of success - a map with replicaset ID (UUID or name) keys and -- values being what the function returned from the replicaset. @@ -1095,84 +1107,25 @@ end -- found even though all replicasets were scanned. -- local function router_map_callrw(router, func, args, opts) - local replicasets, err, err_id, map, futures, rid - local timeout, do_return_raw + local replicasets_to_map, err, err_id, map, futures, rid + local timeout, do_return_raw, bucket_ids if opts then timeout = opts.timeout or consts.CALL_TIMEOUT_MIN do_return_raw = opts.return_raw + bucket_ids = opts.bucket_ids else timeout = consts.CALL_TIMEOUT_MIN end - timeout, err, err_id, rid, futures, replicasets = - router_ref_storage_all(router, timeout) - if timeout then - map, err, err_id = replicasets_map_reduce(replicasets, rid, func, - args, { - timeout = timeout, return_raw = do_return_raw - }) - if map then - return map - end - end - replicasets_map_cancel_refs(replicasets, futures, rid) - err = lerror.make(err) - return nil, err, err_id -end - --- --- Partial Ref-Map-Reduce. The given function is called on the masters --- with a guarantee that in case of success it was executed exactly once --- on all storages that contain the given buckets. Execution on masters --- is chosen to allow buckets be accessed for reads and writes during the --- call. --- --- The execution consists of the following stages: --- --- Ref stage. We ref all the storages with the given buckets. It is --- an iterative process, because the router's cache may be outdated, --- and we need to discover the moved buckets. At the end of this stage --- we have refed all the storages containing the given buckets with the --- given timeout. --- --- Map stage. 
We call the given function on all the storages we have --- refed. On the storage side we use a `storage_map` function, which --- switches the timeout ref to the manual mode and unrefs the storage --- after the function is called. --- --- Reduce stage. We collect the results of the function call. --- --- @param router Instance to use. --- @param bucket_ids List of bucket identifiers. --- @param func Name of the function to call. --- @param args Function arguments passed in netbox style (as an array). --- @param opts Can only contain 'timeout' as a number of seconds. Note that the --- refs may end up being kept on the storages during this entire timeout if --- something goes wrong. For instance, network issues appear. This means --- better not use a value bigger than necessary. A stuck infinite ref can --- only be dropped by this router restart/reconnect or the storage restart. --- --- @return In case of success - a map with replicaset UUID keys and values being --- what the function returned from the replicaset. --- --- @return In case of an error - nil, error object, optional UUID of the --- replicaset where the error happened. UUID may be not present if it wasn't --- about concrete replicaset. For example, not all buckets were found even --- though all replicasets were scanned. --- -local function router_map_part_callrw(router, bucket_ids, func, args, opts) - local timeout, err, err_id, map, futures, do_return_raw, rid - local replicasets_to_map - if opts then - timeout = opts.timeout or consts.CALL_TIMEOUT_MIN - do_return_raw = opts.return_raw + if bucket_ids then + timeout, err, err_id, rid, futures, replicasets_to_map = + router_ref_storage_by_buckets(router, bucket_ids, timeout) else - timeout = consts.CALL_TIMEOUT_MIN + timeout, err, err_id, rid, futures, replicasets_to_map = + router_ref_storage_all(router, timeout) end - timeout, err, err_id, rid, futures, replicasets_to_map = - router_ref_storage_by_buckets(router, bucket_ids, timeout) if timeout then - map, err, err_id = replicasets_map_reduce( - replicasets_to_map, rid, func, args, { + map, err, err_id = replicasets_map_reduce(replicasets_to_map, rid, func, + args, { timeout = timeout, return_raw = do_return_raw }) if map then @@ -2029,7 +1982,6 @@ local router_mt = { callre = router_make_api(router_callre), callbre = router_make_api(router_callbre), map_callrw = router_make_api(router_map_callrw), - map_part_callrw = router_make_api(router_map_part_callrw), route = router_make_api(router_route), routeall = router_make_api(router_routeall), bucket_id = router_make_api(router_bucket_id), From 59deb2824721ae80ab1a1a6a5e4e32a317df08e7 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Thu, 3 Oct 2024 12:26:47 +0200 Subject: [PATCH 13/17] test: move test_map_callrw_raw() to another file Lets merge all map_callrw() tests into a single file. NO_DOC=internal --- test/router-luatest/map_part_test.lua | 59 ++++++++++++++++++++ test/router-luatest/router_test.lua | 80 --------------------------- 2 files changed, 59 insertions(+), 80 deletions(-) diff --git a/test/router-luatest/map_part_test.lua b/test/router-luatest/map_part_test.lua index cdaccd28..b443f13a 100644 --- a/test/router-luatest/map_part_test.lua +++ b/test/router-luatest/map_part_test.lua @@ -417,3 +417,62 @@ g.test_map_part_callrw_raw = function(cg) _G.old_do_map = nil end) end + +g.test_map_all_callrw_raw = function(cg) + t.run_only_if(vutil.feature.netbox_return_raw) + -- + -- Successful map. 
+ -- + local res = router_do_map(cg.router, {3}, { + timeout = vtest.wait_timeout, + return_raw = true, + }) + t.assert_equals(res.val, { + [cg.rs1_uuid] = {{cg.rs1_uuid, 3}}, + [cg.rs2_uuid] = {{cg.rs2_uuid, 3}}, + }) + t.assert_equals(res.val_type, 'userdata') + t.assert(not res.err) + -- + -- Successful map, but one of the storages returns nothing. + -- + cg.replica_2_a:exec(function() + rawset(_G, 'old_do_map', _G.do_map) + _G.do_map = function() + return + end + end) + res = router_do_map(cg.router, {}, { + timeout = vtest.wait_timeout, + return_raw = true, + }) + t.assert_equals(res.val, { + [cg.rs1_uuid] = {{cg.rs1_uuid}}, + }) + -- + -- Error at map stage. + -- + cg.replica_2_a:exec(function() + _G.do_map = function() + return box.error(box.error.PROC_LUA, "map_err") + end + end) + res = router_do_map(cg.router, {}, { + timeout = vtest.wait_timeout, + return_raw = true, + }) + t.assert_equals(res.val, nil) + t.assert_covers(res.err, { + code = box.error.PROC_LUA, + type = 'ClientError', + message = 'map_err' + }, 'error object') + t.assert_equals(res.err_uuid, cg.rs2_uuid, 'error uuid') + -- + -- Cleanup. + -- + cg.replica_2_a:exec(function() + _G.do_map = _G.old_do_map + _G.old_do_map = nil + end) +end diff --git a/test/router-luatest/router_test.lua b/test/router-luatest/router_test.lua index 022f578c..cb4456fa 100644 --- a/test/router-luatest/router_test.lua +++ b/test/router-luatest/router_test.lua @@ -206,86 +206,6 @@ g.test_return_raw = function(g) end) end -g.test_map_callrw_raw = function(g) - t.run_only_if(vutil.feature.netbox_return_raw) - - local create_map_func_f = function(res1) - rawset(_G, 'do_map', function(res2) - return {res1, res2} - end) - end - g.replica_1_a:exec(create_map_func_f, {1}) - g.replica_2_a:exec(create_map_func_f, {2}) - -- - -- Successful map. - -- - local res = g.router:exec(function() - local val, err = ivshard.router.map_callrw( - 'do_map', imsgpack.object({3}), {timeout = iwait_timeout, - return_raw = true}) - local _, one_map = next(val) - return { - val = val, - map_type = type(one_map), - err = err, - } - end) - local rs1_uuid = g.replica_1_a:replicaset_uuid() - local rs2_uuid = g.replica_2_a:replicaset_uuid() - local expected = { - [rs1_uuid] = {{1, 3}}, - [rs2_uuid] = {{2, 3}}, - } - t.assert_equals(res.val, expected, 'map callrw success') - t.assert_equals(res.map_type, 'userdata', 'values are msgpacks') - t.assert(not res.err, 'no error') - -- - -- Successful map, but one of the storages returns nothing. - -- - g.replica_2_a:exec(function() - _G.do_map = function() - return - end - end) - res = g.router:exec(function() - return ivshard.router.map_callrw('do_map', {}, {timeout = iwait_timeout, - return_raw = true}) - end) - expected = { - [rs1_uuid] = {{1}}, - } - t.assert_equals(res, expected, 'map callrw without one value success') - -- - -- Error at map stage. - -- - g.replica_2_a:exec(function() - _G.do_map = function() - return box.error(box.error.PROC_LUA, "map_err") - end - end) - local err, err_uuid - res, err, err_uuid = g.router:exec(function() - return ivshard.router.map_callrw('do_map', {}, {timeout = iwait_timeout, - return_raw = true}) - end) - t.assert(res == nil, 'no result') - t.assert_covers(err, { - code = box.error.PROC_LUA, - type = 'ClientError', - message = 'map_err' - }, 'error object') - t.assert_equals(err_uuid, rs2_uuid, 'error uuid') - -- - -- Cleanup. 
- -- - g.replica_1_a:exec(function() - _G.do_map = nil - end) - g.replica_2_a:exec(function() - _G.do_map = nil - end) -end - g.test_group_buckets = function(g) local bids = vtest.storage_get_n_buckets(g.replica_1_a, 2) local val, err = g.router:exec(function(bid1, bid2) From ca39caca4421f7e6c9828ad0d4aef44ff838f58b Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Thu, 3 Oct 2024 12:28:17 +0200 Subject: [PATCH 14/17] test: +1 replicaset to map_callrw() tests When there were only 2, all cases would either cover a single replicaset or "all" of them. Lets make them 3, so that some tests actually cover a part of a cluster which is not just a single replicaset. NO_DOC=internal --- test/router-luatest/map_part_test.lua | 30 ++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/test/router-luatest/map_part_test.lua b/test/router-luatest/map_part_test.lua index b443f13a..2fd28811 100644 --- a/test/router-luatest/map_part_test.lua +++ b/test/router-luatest/map_part_test.lua @@ -22,8 +22,16 @@ local cfg_template = { replica_2_b = {}, }, }, + { + replicas = { + replica_3_a = { + master = true, + }, + replica_3_b = {}, + }, + }, }, - bucket_count = 10, + bucket_count = 30, test_user_grant_range = 'super', } local global_cfg = vtest.config_new(cfg_template) @@ -76,6 +84,7 @@ g.before_all(function(cg) t.assert_equals(err, nil) cg.rs1_uuid = cg.replica_1_a:replicaset_uuid() cg.rs2_uuid = cg.replica_2_a:replicaset_uuid() + cg.rs3_uuid = cg.replica_3_a:replicaset_uuid() end) g.after_all(function(cg) @@ -116,6 +125,23 @@ g.test_map_part_multi_rs = function(cg) }) end +g.test_map_part_all_rs = function(cg) + local bid1 = vtest.storage_first_bucket(cg.replica_1_a) + local bid2 = vtest.storage_first_bucket(cg.replica_2_a) + local bid3 = vtest.storage_first_bucket(cg.replica_3_a) + local res = router_do_map(cg.router, {123}, { + timeout = vtest.wait_timeout, + bucket_ids = {bid1, bid2, bid3}, + }) + t.assert_equals(res.err, nil) + t.assert_equals(res.err_uuid, nil) + t.assert_equals(res.val, { + [cg.rs1_uuid] = {{cg.rs1_uuid, 123}}, + [cg.rs2_uuid] = {{cg.rs2_uuid, 123}}, + [cg.rs3_uuid] = {{cg.rs3_uuid, 123}}, + }) +end + g.test_map_part_ref = function(cg) -- First move some buckets from rs1 to rs2 and then pause gc on rs1. -- As a result, the buckets will be in the SENT state on rs1 and @@ -430,6 +456,7 @@ g.test_map_all_callrw_raw = function(cg) t.assert_equals(res.val, { [cg.rs1_uuid] = {{cg.rs1_uuid, 3}}, [cg.rs2_uuid] = {{cg.rs2_uuid, 3}}, + [cg.rs3_uuid] = {{cg.rs3_uuid, 3}}, }) t.assert_equals(res.val_type, 'userdata') t.assert(not res.err) @@ -448,6 +475,7 @@ g.test_map_all_callrw_raw = function(cg) }) t.assert_equals(res.val, { [cg.rs1_uuid] = {{cg.rs1_uuid}}, + [cg.rs3_uuid] = {{cg.rs3_uuid}}, }) -- -- Error at map stage. From ead3770e6e91873380167dcc0e8517fddd603fa6 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Thu, 3 Oct 2024 13:07:52 +0200 Subject: [PATCH 15/17] storage: fix moved buckets check 'moved_buckets' function would treat as "moved" all the buckets which are not strictly ACTIVE. But that isn't optimal. Also the 'moved_buckets' func would assume that when ref creation is started, by the end of it the buckets stay unchanged. That isn't true. Thirdly, the moved buckets could contain the destination where did they move to. Returning this to the router would make the re-discovery faster. Fourthly, PINNED buckets were not considered ACTIVE. The commit fixes all these issues. 
Firstly, when a bucket is SENDING, returning an error right away isn't good. The router would just keep retrying then, without any progress. The bucket is actually here, it is not moved yet. Better let the storage try to take a ref. Then one of 2 results are possible: - It waits without useless active retries. And then SENDING fails and becomes ACTIVE. Ref is taken, all good. - It waits without useless active retries. SENDING turns into SENT. Ref is taken for the other buckets, and this one is reported as moved. Similar logic applies to RECEIVING. Secondly, after a ref is taken, the not-moved buckets could become moved. Need to re-check them before returning the ref. Luckily, the storage can use bucket_generation to avoid this double-check when nothing changed in _bucket. NO_DOC=bugfix --- test/storage-luatest/storage_1_test.lua | 292 ++++++++++++++++++++++-- vshard/router/init.lua | 14 +- vshard/storage/init.lua | 50 ++-- 3 files changed, 321 insertions(+), 35 deletions(-) diff --git a/test/storage-luatest/storage_1_test.lua b/test/storage-luatest/storage_1_test.lua index 0e8d8bd4..1056f2ac 100644 --- a/test/storage-luatest/storage_1_test.lua +++ b/test/storage-luatest/storage_1_test.lua @@ -226,7 +226,7 @@ test_group.test_bucket_skip_validate = function(g) end) end -test_group.test_ref_with_buckets = function(g) +test_group.test_ref_with_buckets_basic = function(g) g.replica_1_a:exec(function() local lref = require('vshard.storage.ref') local res, err, _ @@ -277,7 +277,11 @@ test_group.test_ref_with_buckets = function(g) ilt.assert_equals(err, nil) ilt.assert_equals(res, { rid = rid, - moved = {bucket_count + 1, bucket_count + 2, bucket_count + 3} + moved = { + {id = bucket_count + 1}, + {id = bucket_count + 2}, + {id = bucket_count + 3}, + } }) _, err = ivshard.storage._call('storage_unref', rid) ilt.assert_equals(err, nil) @@ -290,40 +294,296 @@ test_group.test_ref_with_buckets = function(g) {bucket_count + 1, bucket_count + 2} ) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {moved = {bucket_count + 1, bucket_count + 2}}) + ilt.assert_equals(res, {moved = { + {id = bucket_count + 1}, + {id = bucket_count + 2}, + }}) ilt.assert_equals(lref.count, 0) + end) +end +test_group.test_ref_with_buckets_timeout = function(g) + g.replica_1_a:exec(function() + local lref = require('vshard.storage.ref') + local rid = 42 + local bids = _G.get_n_buckets(2) + -- -- Timeout when some buckets aren't writable. Doesn't have to be the -- same buckets as for moving. + -- _G.bucket_recovery_pause() box.space._bucket:update( {bids[1]}, {{'=', 2, ivconst.BUCKET.SENDING}}) - res, err = ivshard.storage._call( + local res, err = ivshard.storage._call( 'storage_ref_with_buckets', rid, 0.01, {bids[2]}) box.space._bucket:update( {bids[1]}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) - _G.bucket_recovery_continue() t.assert_str_contains(err.message, 'Timeout exceeded') ilt.assert_equals(res, nil) ilt.assert_equals(lref.count, 0) + _G.bucket_recovery_continue() end) end -test_group.test_moved_buckets = function(g) +test_group.test_ref_with_buckets_return_last_known_dst = function(g) g.replica_1_a:exec(function() + local lref = require('vshard.storage.ref') + local rid = 42 + local bid = _G.get_first_bucket() + local luuid = require('uuid') + local id = luuid.str() + -- Make the bucket follow the correct state sequence. Another way is + -- validated and not allowed. 
+ _G.bucket_recovery_pause() + _G.bucket_gc_pause() + box.space._bucket:update( + {bid}, {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id}}) + box.space._bucket:update( + {bid}, {{'=', 2, ivconst.BUCKET.SENT}}) local res, err = ivshard.storage._call( - 'moved_buckets', - {_G.get_first_bucket()} - ) + 'storage_ref_with_buckets', rid, iwait_timeout, {bid}) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {moved = {}}) + ilt.assert_equals(res, {moved = {{ + id = bid, + dst = id, + status = ivconst.BUCKET.SENT, + }}}) + ilt.assert_equals(lref.count, 0) + -- Cleanup. + _G.bucket_gc_continue() + _G.bucket_gc_wait() + box.space._bucket:insert({bid, ivconst.BUCKET.RECEIVING}) + box.space._bucket:update({bid}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) + _G.bucket_recovery_continue() + end) +end - local bucket_count = ivshard.storage.internal.total_bucket_count - res, err = ivshard.storage._call( - 'moved_buckets', - {_G.get_first_bucket(), bucket_count + 1, bucket_count + 2} - ) +test_group.test_ref_with_buckets_move_part_while_referencing = function(g) + g.replica_1_a:exec(function() + local lref = require('vshard.storage.ref') + local _ + local rid = 42 + local bids = _G.get_n_buckets(3) + local luuid = require('uuid') + local id = luuid.str() + -- + -- Was not moved until ref, but moved while ref was waiting. + -- + _G.bucket_recovery_pause() + _G.bucket_gc_pause() + -- Block the refs for a while. + box.space._bucket:update( + {bids[3]}, {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id}}) + -- Start referencing. + local session_id + local f = ifiber.new(function() + session_id = box.session.id() + return ivshard.storage._call('storage_ref_with_buckets', rid, + iwait_timeout, {bids[1], bids[2]}) + end) + f:set_joinable(true) + -- While waiting, one of the target buckets starts moving. + box.space._bucket:update( + {bids[2]}, {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id}}) + -- Now they are moved. + box.space._bucket:update({bids[2]}, {{'=', 2, ivconst.BUCKET.SENT}}) + box.space._bucket:update({bids[3]}, {{'=', 2, ivconst.BUCKET.SENT}}) + _G.bucket_gc_continue() + _G.bucket_gc_wait() + local ok, res, err = f:join() + t.assert(ok) + t.assert_equals(err, nil) + ilt.assert_equals(res, { + moved = {{id = bids[2], dst = id}}, + rid = rid, + }) + -- Ref was done, because at least one bucket was ok. + ilt.assert_equals(lref.count, 1) + -- Cleanup. + _, err = lref.del(rid, session_id) + ilt.assert_equals(err, nil) + ilt.assert_equals(lref.count, 0) + box.space._bucket:insert({bids[2], ivconst.BUCKET.RECEIVING}) + box.space._bucket:update({bids[2]}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) + box.space._bucket:insert({bids[3], ivconst.BUCKET.RECEIVING}) + box.space._bucket:update({bids[3]}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) + _G.bucket_recovery_continue() + end) +end + +test_group.test_ref_with_buckets_move_all_while_referencing = function(g) + g.replica_1_a:exec(function() + local lref = require('vshard.storage.ref') + local rid = 42 + local bids = _G.get_n_buckets(3) + local luuid = require('uuid') + local id = luuid.str() + -- + -- Now same, but all buckets were moved. No ref should be left then. + -- + _G.bucket_recovery_pause() + _G.bucket_gc_pause() + -- Block the refs for a while. + box.space._bucket:update( + {bids[3]}, {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id}}) + -- Start referencing. 
+ local f = ifiber.new(function() + return ivshard.storage._call('storage_ref_with_buckets', rid, + iwait_timeout, {bids[1], bids[2]}) + end) + f:set_joinable(true) + -- While waiting, all the target buckets start moving. + box.space._bucket:update( + {bids[1]}, {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id}}) + box.space._bucket:update( + {bids[2]}, {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id}}) + -- Now they are moved. + box.space._bucket:update({bids[1]}, {{'=', 2, ivconst.BUCKET.SENT}}) + box.space._bucket:update({bids[2]}, {{'=', 2, ivconst.BUCKET.SENT}}) + -- And the other bucket doesn't matter. Can revert it back. + box.space._bucket:update({bids[3]}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) + _G.bucket_gc_continue() + _G.bucket_gc_wait() + local ok, res, err = f:join() + t.assert(ok) + t.assert_equals(err, nil) + ilt.assert_equals(res, { + moved = { + {id = bids[1], dst = id}, + {id = bids[2], dst = id}, + } + }) + -- Ref was not done, because all the buckets moved out. + ilt.assert_equals(lref.count, 0) + -- Cleanup. + box.space._bucket:insert({bids[1], ivconst.BUCKET.RECEIVING}) + box.space._bucket:update({bids[1]}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) + box.space._bucket:insert({bids[2], ivconst.BUCKET.RECEIVING}) + box.space._bucket:update({bids[2]}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) + _G.bucket_recovery_continue() + end) +end + +test_group.test_moved_buckets_various_statuses = function(g) + g.replica_1_a:exec(function() + local _bucket = box.space._bucket + -- Make sure that if any bucket status is added/deleted/changed, the + -- test gets an update. + ilt.assert_equals(ivconst.BUCKET, { + ACTIVE = 'active', + PINNED = 'pinned', + SENDING = 'sending', + SENT = 'sent', + RECEIVING = 'receiving', + GARBAGE = 'garbage', + }) + _G.bucket_recovery_pause() + _G.bucket_gc_pause() + local luuid = require('uuid') + -- +1 to delete and make it a 404 bucket. + local bids = _G.get_n_buckets(7) + + -- ACTIVE = bids[1]. + -- + -- PINNED = bids[2]. + local bid_pinned = bids[2] + _bucket:update({bid_pinned}, + {{'=', 2, ivconst.BUCKET.PINNED}}) + + -- SENDING = bids[3]. + local bid_sending = bids[3] + local id_sending = luuid.str() + _bucket:update({bid_sending}, + {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id_sending}}) + + -- SENT = bids[4]. + local bid_sent = bids[4] + local id_sent = luuid.str() + _bucket:update({bid_sent}, + {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id_sent}}) + _bucket:update({bid_sent}, + {{'=', 2, ivconst.BUCKET.SENT}}) + + -- RECEIVING = bids[5]. + local bid_receiving = bids[5] + local id_receiving = luuid.str() + _bucket:update({bid_receiving}, + {{'=', 2, ivconst.BUCKET.SENDING}}) + _bucket:update({bid_receiving}, + {{'=', 2, ivconst.BUCKET.SENT}}) + _bucket:update({bid_receiving}, + {{'=', 2, ivconst.BUCKET.GARBAGE}}) + _bucket:delete({bid_receiving}) + _bucket:insert({bid_receiving, ivconst.BUCKET.RECEIVING, id_receiving}) + + -- GARBAGE = bids[6]. + local bid_garbage = bids[6] + local id_garbage = luuid.str() + _bucket:update({bid_garbage}, + {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id_garbage}}) + _bucket:update({bid_garbage}, + {{'=', 2, ivconst.BUCKET.SENT}}) + _bucket:update({bid_garbage}, + {{'=', 2, ivconst.BUCKET.GARBAGE}}) + + -- NOT EXISTING = bids[7]. 
+ local bid_404 = bids[7] + _bucket:update({bid_404}, + {{'=', 2, ivconst.BUCKET.SENDING}}) + _bucket:update({bid_404}, + {{'=', 2, ivconst.BUCKET.SENT}}) + _bucket:update({bid_404}, + {{'=', 2, ivconst.BUCKET.GARBAGE}}) + _bucket:delete({bid_404}) + + local res, err = ivshard.storage._call('moved_buckets', bids) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {moved = {bucket_count + 1, bucket_count + 2}}) + ilt.assert(res and res.moved) + ilt.assert_items_equals(res.moved, { + { + id = bid_sent, + dst = id_sent, + status = ivconst.BUCKET.SENT, + }, + { + id = bid_garbage, + dst = id_garbage, + status = ivconst.BUCKET.GARBAGE, + }, + { + id = bid_404, + } + }) + -- + -- Cleanup. + -- + -- NOT EXISTING. + _bucket:insert({bid_404, ivconst.BUCKET.RECEIVING}) + _bucket:update({bid_404}, + {{'=', 2, ivconst.BUCKET.ACTIVE}}) + -- GARBAGE. + _bucket:delete({bid_garbage}) + _bucket:insert({bid_garbage, ivconst.BUCKET.RECEIVING}) + _bucket:update({bid_garbage}, + {{'=', 2, ivconst.BUCKET.ACTIVE}}) + -- RECEIVING. + _bucket:update({bid_receiving}, + {{'=', 2, ivconst.BUCKET.ACTIVE}}) + -- SENT. + _bucket:update({bid_sent}, + {{'=', 2, ivconst.BUCKET.GARBAGE}}) + _bucket:delete({bid_sent}) + _bucket:insert({bid_sent, ivconst.BUCKET.RECEIVING}) + _bucket:update({bid_sent}, + {{'=', 2, ivconst.BUCKET.ACTIVE}}) + -- SENDING. + _bucket:update({bid_sending}, + {{'=', 2, ivconst.BUCKET.ACTIVE}}) + -- PINNED. + _bucket:update({bid_pinned}, + {{'=', 2, ivconst.BUCKET.ACTIVE}}) + + _G.bucket_recovery_continue() + _G.bucket_gc_continue() end) end diff --git a/vshard/router/init.lua b/vshard/router/init.lua index 0430b596..2986c9e9 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -947,9 +947,17 @@ local function router_ref_storage_by_buckets(router, bucket_ids, timeout) err_id = id goto fail end - for _, bucket_id in pairs(res.moved) do - bucket_reset(router, bucket_id) - table.insert(bucket_ids, bucket_id) + for _, bucket in pairs(res.moved) do + local bid = bucket.id + local dst = bucket.dst + -- 'Reset' regardless of 'set'. So as not to + -- bother with 'set' errors. If it fails, then + -- won't matter. It is a best-effort thing. + bucket_reset(router, bid) + if dst ~= nil then + bucket_set(router, bid, dst) + end + table.insert(bucket_ids, bid) end if res.rid then assert(not replicasets_to_map[id]) diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua index a8fa3a07..f49b564d 100644 --- a/vshard/storage/init.lua +++ b/vshard/storage/init.lua @@ -3181,24 +3181,40 @@ local function storage_ref(rid, timeout) end -- --- Lookup for absent active buckets. --- --- @param bucket_ids List of bucket identifiers. --- @return A dictionary with a list of bucket identifiers which are --- not present on the current instance. +-- Lookup buckets which are definitely not going to recover into ACTIVE state +-- under any circumstances. 
-- local function storage_moved_buckets(bucket_ids) - local status = consts.BUCKET + local allstatus = consts.BUCKET local moved_buckets = {} for _, bucket_id in pairs(bucket_ids) do local bucket = box.space._bucket:get{bucket_id} - if not bucket or bucket.status ~= status.ACTIVE then - table.insert(moved_buckets, bucket_id) + local is_moved + if not bucket then + is_moved = true + else + local status = bucket.status + is_moved = status == allstatus.GARBAGE or status == allstatus.SENT + end + if is_moved then + table.insert(moved_buckets, { + id = bucket_id, + dst = bucket and bucket.destination or M.route_map[bucket_id], + status = bucket and bucket.status, + }) end end return { moved = moved_buckets } end +-- +-- Drop a storage ref from the current box session. Is used as a part of +-- Map-Reduce API. +-- +local function storage_unref(rid) + return lref.del(rid, box.session.id()) +end + -- -- Bind a new storage ref to the current box session and check -- that all the buckets are present. Is used as a part of the @@ -3219,21 +3235,23 @@ local function storage_ref_with_buckets(rid, timeout, bucket_ids) -- ref. return {rid = nil, moved = moved} end + local bucket_generation = M.bucket_generation local ok, err = storage_ref(rid, timeout) if not ok then return nil, err end + if M.bucket_generation ~= bucket_generation then + -- Need to redo it. Otherwise there is a risk that some buckets were + -- moved while waiting for the ref. + moved = storage_moved_buckets(bucket_ids).moved + if #moved == #bucket_ids then + storage_unref(rid) + rid = nil + end + end return {rid = rid, moved = moved} end --- --- Drop a storage ref from the current box session. Is used as a part of --- Map-Reduce API. --- -local function storage_unref(rid) - return lref.del(rid, box.session.id()) -end - -- -- Execute a user's function under an infinite storage ref protecting from -- bucket moves. The ref should exist before, and is deleted after, regardless From f316ad1b6daf80edb64ac99c91c9175cb45d87b9 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Wed, 9 Oct 2024 23:58:02 +0200 Subject: [PATCH 16/17] storage: fix moved buckets ref check During the partial Map-Reduce the router might visit some storages more than once. Happens when after a ref on storage-A another storage-B reports A as having taken some buckets. Then router would come back to A to confirm that. The storage still must hold its previously created ref in order for such checks to make any sense. Otherwise any of the previously confirmed buckets could have had escaped by now. Without the ref-checking the router could reach the Map stage and send some Map requests even though could detect earlier, that not all storages would succeed. This wasn't strictly speaking a bug, but it was clearly suboptimal behaviour leading to the requests being executed not on all the needed storages while the others would report errors. 
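In code, the router-side decision boils down to the following (a
simplified sketch using the call names from the router diff below; the
helper name 'pick_ref_call' is made up for illustration and is not part
of the patch):

    -- A revisit must only verify the previously taken ref. Taking a new
    -- one instead would not restore the guarantees the lost ref used to
    -- provide for the already confirmed buckets.
    local function pick_ref_call(is_already_referenced, rid, timeout, buckets)
        if is_already_referenced then
            -- The storage was refed on an earlier iteration: ask it to
            -- confirm the ref is still alive and report which of these
            -- buckets have moved since then.
            return {'storage_ref_check_with_buckets', rid, buckets}
        end
        -- First visit: take the ref and check the buckets in one call.
        return {'storage_ref_make_with_buckets', rid, timeout, buckets}
    end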
NO_DOC=internal --- test/router-luatest/map_part_test.lua | 23 ++++++++++++- test/storage-luatest/storage_1_test.lua | 38 +++++++++++----------- test/upgrade/upgrade.result | 4 +-- vshard/router/init.lua | 10 +++--- vshard/storage/init.lua | 43 ++++++++++++++++++------- 5 files changed, 80 insertions(+), 38 deletions(-) diff --git a/test/router-luatest/map_part_test.lua b/test/router-luatest/map_part_test.lua index 2fd28811..2de0291c 100644 --- a/test/router-luatest/map_part_test.lua +++ b/test/router-luatest/map_part_test.lua @@ -267,6 +267,18 @@ g.test_map_part_ref_timeout = function(cg) end end, {{bid1, bid2, bid3, bid4}}) + -- Count the map calls. The loss of ref must be detected before the + -- map-stage. + local _, err = vtest.cluster_exec_each_master(cg, function() + rawset(_G, 'old_do_map', _G.do_map) + rawset(_G, 'map_count', 0) + _G.do_map = function(...) + _G.map_count = _G.map_count + 1 + return _G.old_do_map(...) + end + end) + t.assert_equals(err, nil) + -- Send bucket so the router thinks: -- rs1: {b1, b2}, rs2: {b3, b4} -- and actually the state is: @@ -324,11 +336,20 @@ g.test_map_part_ref_timeout = function(cg) t.assert_equals(res.err_uuid, cg.rs2_uuid) -- Make sure there are no references left. - local _, err = vtest.cluster_exec_each(cg, function() + _, err = vtest.cluster_exec_each(cg, function() ilt.assert_equals(require('vshard.storage.ref').count, 0) end) t.assert_equals(err, nil) + -- No maps had a chance to get executed. + _, err = vtest.cluster_exec_each_master(cg, function() + ilt.assert_equals(_G.map_count, 0) + _G.do_map = _G.old_do_map + _G.old_do_map = nil + _G.map_count = nil + end) + t.assert_equals(err, nil) + -- Return the bucket back and re-enable discovery on the router. cg.replica_2_a:exec(function(bid, to) _G.bucket_send(bid, to) diff --git a/test/storage-luatest/storage_1_test.lua b/test/storage-luatest/storage_1_test.lua index 1056f2ac..1e1600b3 100644 --- a/test/storage-luatest/storage_1_test.lua +++ b/test/storage-luatest/storage_1_test.lua @@ -236,16 +236,16 @@ test_group.test_ref_with_buckets_basic = function(g) -- No buckets. res, err = ivshard.storage._call( - 'storage_ref_with_buckets', rid, iwait_timeout, {}) + 'storage_ref_make_with_buckets', rid, iwait_timeout, {}) ilt.assert_equals(err, nil) ilt.assert_equals(res, {moved = {}}) ilt.assert_equals(lref.count, 0) -- Check for a single ok bucket. res, err = ivshard.storage._call( - 'storage_ref_with_buckets', rid, iwait_timeout, {bids[1]}) + 'storage_ref_make_with_buckets', rid, iwait_timeout, {bids[1]}) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {rid = rid, moved = {}}) + ilt.assert_equals(res, {is_done = true, moved = {}}) ilt.assert_equals(lref.count, 1) _, err = ivshard.storage._call('storage_unref', rid) ilt.assert_equals(err, nil) @@ -253,17 +253,19 @@ test_group.test_ref_with_buckets_basic = function(g) -- Check for multiple ok buckets. res, err = ivshard.storage._call( - 'storage_ref_with_buckets', rid, iwait_timeout, {bids[1], bids[2]}) + 'storage_ref_make_with_buckets', rid, iwait_timeout, + {bids[1], bids[2]}) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {rid = rid, moved = {}}) + ilt.assert_equals(res, {is_done = true, moved = {}}) _, err = ivshard.storage._call('storage_unref', rid) ilt.assert_equals(err, nil) -- Check for double referencing. 
res, err = ivshard.storage._call( - 'storage_ref_with_buckets', rid, iwait_timeout, {bids[1], bids[1]}) + 'storage_ref_make_with_buckets', rid, iwait_timeout, + {bids[1], bids[1]}) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {rid = rid, moved = {}}) + ilt.assert_equals(res, {is_done = true, moved = {}}) ilt.assert_equals(lref.count, 1) _, err = ivshard.storage._call('storage_unref', rid) ilt.assert_equals(err, nil) @@ -271,12 +273,12 @@ test_group.test_ref_with_buckets_basic = function(g) -- Bucket mix. res, err = ivshard.storage._call( - 'storage_ref_with_buckets', rid, iwait_timeout, + 'storage_ref_make_with_buckets', rid, iwait_timeout, {bucket_count + 1, bids[1], bucket_count + 2, bids[2], bucket_count + 3}) ilt.assert_equals(err, nil) ilt.assert_equals(res, { - rid = rid, + is_done = true, moved = { {id = bucket_count + 1}, {id = bucket_count + 2}, @@ -288,7 +290,7 @@ test_group.test_ref_with_buckets_basic = function(g) -- No ref when all buckets are missing. res, err = ivshard.storage._call( - 'storage_ref_with_buckets', + 'storage_ref_make_with_buckets', rid, iwait_timeout, {bucket_count + 1, bucket_count + 2} @@ -315,7 +317,7 @@ test_group.test_ref_with_buckets_timeout = function(g) box.space._bucket:update( {bids[1]}, {{'=', 2, ivconst.BUCKET.SENDING}}) local res, err = ivshard.storage._call( - 'storage_ref_with_buckets', rid, 0.01, {bids[2]}) + 'storage_ref_make_with_buckets', rid, 0.01, {bids[2]}) box.space._bucket:update( {bids[1]}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) t.assert_str_contains(err.message, 'Timeout exceeded') @@ -341,7 +343,7 @@ test_group.test_ref_with_buckets_return_last_known_dst = function(g) box.space._bucket:update( {bid}, {{'=', 2, ivconst.BUCKET.SENT}}) local res, err = ivshard.storage._call( - 'storage_ref_with_buckets', rid, iwait_timeout, {bid}) + 'storage_ref_make_with_buckets', rid, iwait_timeout, {bid}) ilt.assert_equals(err, nil) ilt.assert_equals(res, {moved = {{ id = bid, @@ -378,7 +380,7 @@ test_group.test_ref_with_buckets_move_part_while_referencing = function(g) local session_id local f = ifiber.new(function() session_id = box.session.id() - return ivshard.storage._call('storage_ref_with_buckets', rid, + return ivshard.storage._call('storage_ref_make_with_buckets', rid, iwait_timeout, {bids[1], bids[2]}) end) f:set_joinable(true) @@ -395,7 +397,7 @@ test_group.test_ref_with_buckets_move_part_while_referencing = function(g) t.assert_equals(err, nil) ilt.assert_equals(res, { moved = {{id = bids[2], dst = id}}, - rid = rid, + is_done = true, }) -- Ref was done, because at least one bucket was ok. ilt.assert_equals(lref.count, 1) @@ -428,7 +430,7 @@ test_group.test_ref_with_buckets_move_all_while_referencing = function(g) {bids[3]}, {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id}}) -- Start referencing. 
local f = ifiber.new(function() - return ivshard.storage._call('storage_ref_with_buckets', rid, + return ivshard.storage._call('storage_ref_make_with_buckets', rid, iwait_timeout, {bids[1], bids[2]}) end) f:set_joinable(true) @@ -536,10 +538,8 @@ test_group.test_moved_buckets_various_statuses = function(g) {{'=', 2, ivconst.BUCKET.GARBAGE}}) _bucket:delete({bid_404}) - local res, err = ivshard.storage._call('moved_buckets', bids) - ilt.assert_equals(err, nil) - ilt.assert(res and res.moved) - ilt.assert_items_equals(res.moved, { + local moved = ivshard.storage.internal.bucket_get_moved(bids) + ilt.assert_items_equals(moved, { { id = bid_sent, dst = id_sent, diff --git a/test/upgrade/upgrade.result b/test/upgrade/upgrade.result index 5e8e71f5..e0c9469b 100644 --- a/test/upgrade/upgrade.result +++ b/test/upgrade/upgrade.result @@ -174,13 +174,13 @@ vshard.storage._call('test_api', 1, 2, 3) | - - bucket_recv | - bucket_test_gc | - info - | - moved_buckets | - rebalancer_apply_routes | - rebalancer_request_state | - recovery_bucket_stat | - storage_map | - storage_ref - | - storage_ref_with_buckets + | - storage_ref_check_with_buckets + | - storage_ref_make_with_buckets | - storage_unref | - test_api | - 1 diff --git a/vshard/router/init.lua b/vshard/router/init.lua index 2986c9e9..6c2dc14a 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -919,9 +919,11 @@ local function router_ref_storage_by_buckets(router, bucket_ids, timeout) if replicasets_to_map[id] then -- Replicaset is already referenced on a previous iteration. -- Simply get the moved buckets without double referencing. - args_ref = {'moved_buckets', buckets} + args_ref = { + 'storage_ref_check_with_buckets', rid, buckets} else - args_ref = {'storage_ref_with_buckets', rid, timeout, buckets} + args_ref = { + 'storage_ref_make_with_buckets', rid, timeout, buckets} end res, err = replicasets_all[id]:callrw('vshard.storage._call', args_ref, opts_async) @@ -941,7 +943,7 @@ local function router_ref_storage_by_buckets(router, bucket_ids, timeout) err_id = id goto fail end - -- Ref returns nil,err or {rid, moved}. + -- Ref returns nil,err or {is_done, moved}. res, err = res[1], res[2] if res == nil then err_id = id @@ -959,7 +961,7 @@ local function router_ref_storage_by_buckets(router, bucket_ids, timeout) end table.insert(bucket_ids, bid) end - if res.rid then + if res.is_done then assert(not replicasets_to_map[id]) -- If there are no buckets on the replicaset, it would not be -- referenced. diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua index f49b564d..1c174663 100644 --- a/vshard/storage/init.lua +++ b/vshard/storage/init.lua @@ -3184,9 +3184,9 @@ end -- Lookup buckets which are definitely not going to recover into ACTIVE state -- under any circumstances. 
-- -local function storage_moved_buckets(bucket_ids) +local function bucket_get_moved(bucket_ids) local allstatus = consts.BUCKET - local moved_buckets = {} + local res = {} for _, bucket_id in pairs(bucket_ids) do local bucket = box.space._bucket:get{bucket_id} local is_moved @@ -3197,14 +3197,14 @@ local function storage_moved_buckets(bucket_ids) is_moved = status == allstatus.GARBAGE or status == allstatus.SENT end if is_moved then - table.insert(moved_buckets, { + table.insert(res, { id = bucket_id, dst = bucket and bucket.destination or M.route_map[bucket_id], status = bucket and bucket.status, }) end end - return { moved = moved_buckets } + return res end -- @@ -3228,12 +3228,12 @@ end -- are absent, the reference is not created and a nil reference id -- with the list of absent buckets is returned. -- -local function storage_ref_with_buckets(rid, timeout, bucket_ids) - local moved = storage_moved_buckets(bucket_ids).moved +local function storage_ref_make_with_buckets(rid, timeout, bucket_ids) + local moved = bucket_get_moved(bucket_ids) if #moved == #bucket_ids then -- If all the passed buckets are absent, there is no need to create a -- ref. - return {rid = nil, moved = moved} + return {moved = moved} end local bucket_generation = M.bucket_generation local ok, err = storage_ref(rid, timeout) @@ -3243,13 +3243,31 @@ local function storage_ref_with_buckets(rid, timeout, bucket_ids) if M.bucket_generation ~= bucket_generation then -- Need to redo it. Otherwise there is a risk that some buckets were -- moved while waiting for the ref. - moved = storage_moved_buckets(bucket_ids).moved + moved = bucket_get_moved(bucket_ids) if #moved == #bucket_ids then storage_unref(rid) - rid = nil + return {moved = moved} end end - return {rid = rid, moved = moved} + return {is_done = true, moved = moved} +end + +-- +-- Check which buckets from the given list are moved out of this storage, while +-- also making sure that the storage-ref is still in place. +-- +-- Partial Map-Reduce makes a ref on the storages having any of the needed +-- buckets, but then can come back if other storages report the already visited +-- ones as having the needed buckets. Only makes sense to check, if this storage +-- still holds the ref. Otherwise its previous guarantees given during the ref +-- creation are all gone. 
+-- +local function storage_ref_check_with_buckets(rid, bucket_ids) + local ok, err = lref.check(rid, box.session.id()) + if not ok then + return nil, err + end + return {moved = bucket_get_moved(bucket_ids)} end -- @@ -3298,9 +3316,9 @@ service_call_api = setmetatable({ rebalancer_apply_routes = rebalancer_apply_routes, rebalancer_request_state = rebalancer_request_state, recovery_bucket_stat = recovery_bucket_stat, - moved_buckets = storage_moved_buckets, storage_ref = storage_ref, - storage_ref_with_buckets = storage_ref_with_buckets, + storage_ref_make_with_buckets = storage_ref_make_with_buckets, + storage_ref_check_with_buckets = storage_ref_check_with_buckets, storage_unref = storage_unref, storage_map = storage_map, info = storage_service_info, @@ -4182,6 +4200,7 @@ M.bucket_state_edges = bucket_state_edges M.bucket_are_all_rw = bucket_are_all_rw_public M.bucket_generation_wait = bucket_generation_wait +M.bucket_get_moved = bucket_get_moved lregistry.storage = M -- From 08aa3804b28276bdfb2611971fa969af89123c2b Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Thu, 3 Oct 2024 13:08:35 +0200 Subject: [PATCH 17/17] test: rename map_part_test to map_callrw_test It tests not only partial Map-Reduce. It covers a bit of the full one as well. NO_DOC=internal --- test/router-luatest/{map_part_test.lua => map_callrw_test.lua} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/router-luatest/{map_part_test.lua => map_callrw_test.lua} (100%) diff --git a/test/router-luatest/map_part_test.lua b/test/router-luatest/map_callrw_test.lua similarity index 100% rename from test/router-luatest/map_part_test.lua rename to test/router-luatest/map_callrw_test.lua
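For reference, below is a hedged usage sketch of the bucket_ids option
added to router.map_callrw() by this series. The storage function name
'my_map_func' and the timeout value are purely illustrative; the option
names and return values follow the updated map_callrw() documentation.

    local vshard = require('vshard')

    -- 'my_map_func' is assumed to be a global function defined on every
    -- storage master. It is executed exactly once on each replicaset
    -- that holds at least one of the given buckets.
    local function call_on_buckets(bucket_ids, args)
        local res, err, err_uuid = vshard.router.map_callrw(
            'my_map_func', args, {
                timeout = 10,
                -- Only the replicasets having at least one of these
                -- buckets are refed and mapped. Without the option the
                -- whole cluster is covered, as before.
                bucket_ids = bucket_ids,
            })
        if res == nil then
            -- err_uuid may be nil when the error is not about a concrete
            -- replicaset, e.g. not all buckets were found.
            return nil, err, err_uuid
        end
        -- res maps replicaset ID (UUID or name) to a table with the
        -- values returned by 'my_map_func' on that replicaset.
        return res
    end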