From cea68f3c6769dc65183361fb0b752a83977065d4 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Mon, 23 Sep 2024 23:45:28 +0200 Subject: [PATCH 01/17] test: use vardir for repository copying Upgrade-tests use git to clone the current repository and check the necessary versions out. The cloned repository was always saved in vshard/test/var, even when the actual --var argument for test-run.py was /tmp/var. Lets better make the copied code also stored by the path given in --var. NO_DOC=internal --- test/lua_libs/util.lua | 4 +++- test/reload_evolution/storage.result | 4 ++-- test/reload_evolution/storage.test.lua | 4 ++-- test/reload_evolution/storage_1_a.lua | 12 +++--------- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/test/lua_libs/util.lua b/test/lua_libs/util.lua index 3f435ccd..fd9d947e 100644 --- a/test/lua_libs/util.lua +++ b/test/lua_libs/util.lua @@ -181,8 +181,10 @@ if not BUILDDIR then BUILDDIR = SOURCEDIR end +local VARDIR = fio.abspath(os.getenv('VARDIR') or './') + local function git_checkout(dst_dir, version) - local vshard_copy_path = BUILDDIR..'/test/var/'..dst_dir + local vshard_copy_path = VARDIR..'/'..dst_dir -- Cleanup the directory after a previous build. os.execute('rm -rf ' .. vshard_copy_path) -- `git worktree` cannot be used because PACKPACK mounts diff --git a/test/reload_evolution/storage.result b/test/reload_evolution/storage.result index 4a1b3a09..74cf355a 100644 --- a/test/reload_evolution/storage.result +++ b/test/reload_evolution/storage.result @@ -24,10 +24,10 @@ REPLICASET_1 = { 'storage_1_a', 'storage_1_b' } REPLICASET_2 = { 'storage_2_a', 'storage_2_b' } --- ... -test_run:create_cluster(REPLICASET_1, 'reload_evolution') +test_run:create_cluster(REPLICASET_1, 'reload_evolution', {args = vshard_copy_path}) --- ... -test_run:create_cluster(REPLICASET_2, 'reload_evolution') +test_run:create_cluster(REPLICASET_2, 'reload_evolution', {args = vshard_copy_path}) --- ... util = require('util') diff --git a/test/reload_evolution/storage.test.lua b/test/reload_evolution/storage.test.lua index 3800309a..02a9cca6 100644 --- a/test/reload_evolution/storage.test.lua +++ b/test/reload_evolution/storage.test.lua @@ -12,8 +12,8 @@ vshard_copy_path = util.git_checkout('vshard_git_tree_copy', REPLICASET_1 = { 'storage_1_a', 'storage_1_b' } REPLICASET_2 = { 'storage_2_a', 'storage_2_b' } -test_run:create_cluster(REPLICASET_1, 'reload_evolution') -test_run:create_cluster(REPLICASET_2, 'reload_evolution') +test_run:create_cluster(REPLICASET_1, 'reload_evolution', {args = vshard_copy_path}) +test_run:create_cluster(REPLICASET_2, 'reload_evolution', {args = vshard_copy_path}) util = require('util') util.wait_master(test_run, REPLICASET_1, 'storage_1_a') util.wait_master(test_run, REPLICASET_2, 'storage_2_a') diff --git a/test/reload_evolution/storage_1_a.lua b/test/reload_evolution/storage_1_a.lua index eb189a0e..f4b19a0e 100755 --- a/test/reload_evolution/storage_1_a.lua +++ b/test/reload_evolution/storage_1_a.lua @@ -1,16 +1,10 @@ #!/usr/bin/env tarantool local util = require('util') NAME = require('fio').basename(arg[0], '.lua') - --- Run one storage on a different vshard version. --- To do that, place vshard src to --- BUILDDIR/test/var/vshard_git_tree_copy/. +local source_path = arg[1] original_package_path = package.path if NAME == 'storage_2_a' then - vshard_copy = util.BUILDDIR .. 
'/test/var/vshard_git_tree_copy' - package.path = string.format( - '%s/?.lua;%s/?/init.lua;%s', - vshard_copy, vshard_copy, original_package_path - ) + package.path = string.format('%s/?.lua;%s/?/init.lua;%s', source_path, + source_path, package.path) end require('storage_template') From c437f69a928163a946e9e07658b5d6c14d03b1bf Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Thu, 10 Oct 2024 18:45:54 +0200 Subject: [PATCH 02/17] test: enable strict mode in Lua It makes the unknown variables treated as errors, not as 'nil's. Otherwise it is too easy to use a wrong variable name somewhere and get it as nil, and the tests would even pass, but not test what they are supposed to. NO_DOC=internal --- test/luatest_helpers/vtest.lua | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/luatest_helpers/vtest.lua b/test/luatest_helpers/vtest.lua index a4fc2091..1a751532 100644 --- a/test/luatest_helpers/vtest.lua +++ b/test/luatest_helpers/vtest.lua @@ -8,6 +8,9 @@ local yaml = require('yaml') local vcfg = require('vshard.cfg') local vrepset = require('vshard.replicaset') local log = require('log') +-- Otherwise in non-Debug builds apparently the unknown variables are treated as +-- nil-global-variables. +require('strict').on() local wait_timeout = 50 -- Use it in busy-loops like `while !cond do fiber.sleep(busy_step) end`. From 5e3ebb4840d602f00e3a959ca2f79b1863c61080 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Tue, 24 Sep 2024 21:37:44 +0200 Subject: [PATCH 03/17] router: extract a couple of map-reduce helpers The only map-reduce function is router.map_callrw(). At some point there was a task to introduce a new mode of Map-Reduce - partial, by bucket IDs, not on the whole cluster. For that task was introduced a new function router.map_part_callrw() which had the same Map-Reduce and error handling stages. Only Ref stage was different. The new helpers in this commit were supposed to reuse some code between those two map-call functions. Later it was decided to leave just one map-call function and add a new option to it. But these new helpers still look useful to have as separate functions. They make the map-call function really small and simple. NO_DOC=internal NO_TEST=refactoring --- vshard/router/init.lua | 174 +++++++++++++++++++++++------------------ 1 file changed, 98 insertions(+), 76 deletions(-) diff --git a/vshard/router/init.lua b/vshard/router/init.lua index cbd978ef..f16fb69e 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -761,6 +761,95 @@ local function router_call(router, bucket_id, opts, ...) ...) end +-- +-- Perform map-reduce stages on the given set of replicasets. The map expects a +-- valid ref ID which must be done beforehand. +-- +local function replicasets_map_reduce(replicasets, rid, func, args, opts) + assert(opts) + local timeout = opts.timeout or consts.CALL_TIMEOUT_MIN + local do_return_raw = opts.return_raw + local deadline = fiber_clock() + timeout + local opts_map = {is_async = true, return_raw = do_return_raw} + local futures = {} + local map = {} + -- + -- Map stage: send. + -- + args = {'storage_map', rid, func, args} + for _, rs in pairs(replicasets) do + local res, err = rs:callrw('vshard.storage._call', args, opts_map) + if res == nil then + return nil, err, rs.id + end + futures[rs.id] = res + end + -- + -- Map stage: collect. + -- + if do_return_raw then + for id, f in pairs(futures) do + local res, err = future_wait(f, timeout) + if res == nil then + return nil, err, id + end + -- Map returns true,res or nil,err. 
+ res = res:iterator() + local count = res:decode_array_header() + local ok = res:decode() + if ok == nil then + return nil, res:decode(), id + end + if count > 1 then + map[id] = res:take_array(count - 1) + end + timeout = deadline - fiber_clock() + end + else + for id, f in pairs(futures) do + local res, err = future_wait(f, timeout) + if res == nil then + return nil, err, id + end + local ok + -- Map returns true,res or nil,err. + ok, res = res[1], res[2] + if ok == nil then + return nil, res, id + end + if res ~= nil then + -- Store as a table so in future it could be extended for + -- multireturn. + map[id] = {res} + end + timeout = deadline - fiber_clock() + end + end + return map +end + +-- +-- Cancel whatever pending requests are still waiting for a response and free +-- the given ref ID on all the affected storages. This is helpful when +-- map-reduce breaks in the middle. Makes sense to let the refs go to unblock +-- the rebalancer. +-- +local function replicasets_map_cancel_refs(replicasets, futures, rid) + local opts_async = {is_async = true} + for id, f in pairs(futures) do + f:discard() + -- Best effort to remove the created refs before exiting. Can help if + -- the timeout was big and the error happened early. + f = replicasets[id]:callrw('vshard.storage._call', + {'storage_unref', rid}, opts_async) + if f ~= nil then + -- Don't care waiting for a result - no time for this. But it won't + -- affect the request sending if the connection is still alive. + f:discard() + end + end +end + -- -- Consistent Map-Reduce. The given function is called on all masters in the -- cluster with a guarantee that in case of success it was executed with all @@ -811,11 +900,10 @@ local function router_map_callrw(router, func, args, opts) timeout = consts.CALL_TIMEOUT_MIN end local deadline = fiber_clock() + timeout - local err, err_id, res, ok, map + local err, err_id, res, map local futures = {} local bucket_count = 0 - local opts_ref = {is_async = true} - local opts_map = {is_async = true, return_raw = do_return_raw} + local opts_async = {is_async = true} local rs_count = 0 local rid = M.ref_id M.ref_id = rid + 1 @@ -834,7 +922,7 @@ local function router_map_callrw(router, func, args, opts) goto fail end res, err = rs:callrw('vshard.storage._call', - {'storage_ref', rid, timeout}, opts_ref) + {'storage_ref', rid, timeout}, opts_async) if res == nil then err_id = id goto fail @@ -842,7 +930,6 @@ local function router_map_callrw(router, func, args, opts) futures[id] = res rs_count = rs_count + 1 end - map = table_new(0, rs_count) -- -- Ref stage: collect. -- @@ -873,79 +960,14 @@ local function router_map_callrw(router, func, args, opts) router.total_bucket_count - bucket_count) goto fail end - -- - -- Map stage: send. - -- - args = {'storage_map', rid, func, args} - for id, rs in pairs(replicasets) do - res, err = rs:callrw('vshard.storage._call', args, opts_map) - if res == nil then - err_id = id - goto fail - end - futures[id] = res + map, err, err_id = replicasets_map_reduce(replicasets, rid, func, args, { + timeout = timeout, return_raw = do_return_raw + }) + if map then + return map end - -- - -- Ref stage: collect. - -- - if do_return_raw then - for id, f in pairs(futures) do - res, err = future_wait(f, timeout) - if res == nil then - err_id = id - goto fail - end - -- Map returns true,res or nil,err. 
- res = res:iterator() - local count = res:decode_array_header() - ok = res:decode() - if ok == nil then - err = res:decode() - err_id = id - goto fail - end - if count > 1 then - map[id] = res:take_array(count - 1) - end - timeout = deadline - fiber_clock() - end - else - for id, f in pairs(futures) do - res, err = future_wait(f, timeout) - if res == nil then - err_id = id - goto fail - end - -- Map returns true,res or nil,err. - ok, res = res[1], res[2] - if ok == nil then - err = res - err_id = id - goto fail - end - if res ~= nil then - -- Store as a table so in future it could be extended for - -- multireturn. - map[id] = {res} - end - timeout = deadline - fiber_clock() - end - end - do return map end - ::fail:: - for id, f in pairs(futures) do - f:discard() - -- Best effort to remove the created refs before exiting. Can help if - -- the timeout was big and the error happened early. - f = replicasets[id]:callrw('vshard.storage._call', - {'storage_unref', rid}, opts_ref) - if f ~= nil then - -- Don't care waiting for a result - no time for this. But it won't - -- affect the request sending if the connection is still alive. - f:discard() - end - end + replicasets_map_cancel_refs(replicasets, futures, rid) err = lerror.make(err) return nil, err, err_id end From 165dc1801d40125afd2d67d4ef3a25c0f10ab3f6 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Wed, 9 Oct 2024 20:20:08 +0200 Subject: [PATCH 04/17] storage: introduce ref.check() It ensures the ref is still in place. A read-only operation. It is going to be used in the future commits about partial map-reduce. Router will be going potentially more than once to some storages and at all times it would ensure the ref is still in place. NO_DOC=internal --- test/storage/ref.result | 42 +++++++++++++++++++++++++++++++++++++++ test/storage/ref.test.lua | 16 +++++++++++++++ vshard/storage/ref.lua | 18 +++++++++++++++++ 3 files changed, 76 insertions(+) diff --git a/test/storage/ref.result b/test/storage/ref.result index a81ab5d8..d53176bd 100644 --- a/test/storage/ref.result +++ b/test/storage/ref.result @@ -249,6 +249,11 @@ function make_ref(rid, timeout) end | --- | ... +function check_ref(rid) \ + return lref.check(rid, box.session.id()) \ +end + | --- + | ... function use_ref(rid) \ return lref.use(rid, box.session.id()) \ end @@ -289,6 +294,43 @@ _ = test_run:switch('storage_1_a') | --- | ... +-- Check works. +ok, err = c:call('check_ref', {1}) + | --- + | ... +assert(ok and not err) + | --- + | - true + | ... +_ = test_run:switch('storage_2_a') + | --- + | ... +assert(lref.count == 1) + | --- + | - true + | ... +_ = test_run:switch('storage_1_a') + | --- + | ... + +ok, err = c:call('check_ref', {2}) + | --- + | ... +assert(ok == nil and err.message) + | --- + | - 'Can not use a storage ref: no ref' + | ... +_ = test_run:switch('storage_2_a') + | --- + | ... +assert(lref.count == 1) + | --- + | - true + | ... +_ = test_run:switch('storage_1_a') + | --- + | ... + -- Use works. 
c:call('use_ref', {1}) | --- diff --git a/test/storage/ref.test.lua b/test/storage/ref.test.lua index 17264579..76c67a33 100644 --- a/test/storage/ref.test.lua +++ b/test/storage/ref.test.lua @@ -106,6 +106,9 @@ small_timeout = 0.001 function make_ref(rid, timeout) \ return lref.add(rid, box.session.id(), timeout) \ end +function check_ref(rid) \ + return lref.check(rid, box.session.id()) \ +end function use_ref(rid) \ return lref.use(rid, box.session.id()) \ end @@ -124,6 +127,19 @@ _ = test_run:switch('storage_2_a') assert(lref.count == 1) _ = test_run:switch('storage_1_a') +-- Check works. +ok, err = c:call('check_ref', {1}) +assert(ok and not err) +_ = test_run:switch('storage_2_a') +assert(lref.count == 1) +_ = test_run:switch('storage_1_a') + +ok, err = c:call('check_ref', {2}) +assert(ok == nil and err.message) +_ = test_run:switch('storage_2_a') +assert(lref.count == 1) +_ = test_run:switch('storage_1_a') + -- Use works. c:call('use_ref', {1}) _ = test_run:switch('storage_2_a') diff --git a/vshard/storage/ref.lua b/vshard/storage/ref.lua index df7af092..0b517819 100644 --- a/vshard/storage/ref.lua +++ b/vshard/storage/ref.lua @@ -247,6 +247,14 @@ local function ref_session_new(sid) return true end + local function ref_session_check(self, rid) + local ref = ref_map[rid] + if not ref then + return nil, lerror.vshard(lerror.code.STORAGE_REF_USE, 'no ref') + end + return true + end + -- -- Ref use means it can't be expired until deleted explicitly. Should be -- done when the request affecting the whole storage starts. After use it is @@ -301,6 +309,7 @@ local function ref_session_new(sid) del = ref_session_del, gc = ref_session_gc, add = ref_session_add, + check = ref_session_check, use = ref_session_use, kill = ref_session_kill, } @@ -364,6 +373,14 @@ local function ref_add(rid, sid, timeout) return nil, err end +local function ref_check(rid, sid) + local session = M.session_map[sid] + if not session then + return nil, lerror.vshard(lerror.code.STORAGE_REF_USE, 'no session') + end + return session:check(rid) +end + local function ref_use(rid, sid) local session = M.session_map[sid] if not session then @@ -407,6 +424,7 @@ end M.del = ref_del M.gc = ref_gc M.add = ref_add +M.check = ref_check M.use = ref_use M.cfg = ref_cfg M.kill = ref_kill_session From b7436019d44f112d3db5ba877948b10e91a1e1d8 Mon Sep 17 00:00:00 2001 From: Denis Smirnov Date: Fri, 24 Nov 2023 12:14:42 +0700 Subject: [PATCH 05/17] storage: implement partial map-reduce API The storage-size of the Partial Map-Reduce feature. 
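A rough usage sketch of the storage-side service calls this patch adds (the
rid, timeout and bucket id values below are arbitrary placeholders, not part
of the patch):

    -- Take a ref and learn which of the requested buckets are absent here.
    local res, err = vshard.storage._call('storage_ref_with_lookup',
                                          rid, timeout, {1, 2, 3})
    -- res.moved lists the buckets which are not active on this storage.
    -- res.rid is nil when none of the requested buckets are present - then
    -- no ref is taken at all.

    -- 'storage_map' later consumes the ref; if the map stage is never
    -- reached, the ref can be dropped explicitly:
    local _, err = vshard.storage._call('storage_unref', rid)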
NO_DOC=later --- test/instances/storage.lua | 17 ++++ test/storage-luatest/storage_1_test.lua | 114 ++++++++++++++++++++++++ test/upgrade/upgrade.result | 2 + vshard/storage/init.lua | 66 ++++++++++++++ 4 files changed, 199 insertions(+) diff --git a/test/instances/storage.lua b/test/instances/storage.lua index 4bbff541..95cd4a7a 100755 --- a/test/instances/storage.lua +++ b/test/instances/storage.lua @@ -13,6 +13,7 @@ _G.ivconst = require('vshard.consts') _G.ivutil = require('vshard.util') _G.iverror = require('vshard.error') _G.ivtest = require('test.luatest_helpers.vtest') +_G.itable_new = require('table.new') _G.iwait_timeout = _G.ivtest.wait_timeout @@ -69,6 +70,21 @@ local function get_first_bucket() return res ~= nil and res.id or nil end +local function get_n_buckets(n) + if n <= 0 then + error('Invalid number of buckets') + end + local index = box.space._bucket.index.status + local ids = _G.itable_new(0, n) + for _, tuple in index:pairs(vconst.BUCKET.ACTIVE) do + table.insert(ids, tuple.id) + if #ids == n then + return ids + end + end + error('Not enough buckets') +end + local function session_set(key, value) box.session.storage[key] = value return true @@ -167,6 +183,7 @@ _G.box_error = box_error _G.echo = echo _G.get_uuid = get_uuid _G.get_first_bucket = get_first_bucket +_G.get_n_buckets = get_n_buckets _G.session_set = session_set _G.session_get = session_get _G.bucket_gc_wait = bucket_gc_wait diff --git a/test/storage-luatest/storage_1_test.lua b/test/storage-luatest/storage_1_test.lua index 0d22df20..2e27f811 100644 --- a/test/storage-luatest/storage_1_test.lua +++ b/test/storage-luatest/storage_1_test.lua @@ -225,3 +225,117 @@ test_group.test_bucket_skip_validate = function(g) internal.errinj.ERRINJ_SKIP_BUCKET_STATUS_VALIDATE = false end) end + +test_group.test_ref_with_lookup = function(g) + g.replica_1_a:exec(function() + local res, err, _ + local timeout = 0.1 + local rid = 42 + local bids = _G.get_n_buckets(2) + local bid_extra = 3001 + + -- Check for a single bucket. + res, err = ivshard.storage._call( + 'storage_ref_with_lookup', + rid, + timeout, + {bids[1]} + ) + ilt.assert_equals(err, nil) + ilt.assert_equals(res, {rid = rid, moved = {}}) + _, err = ivshard.storage._call('storage_unref', rid) + ilt.assert_equals(err, nil) + + -- Check for multiple buckets. + res, err = ivshard.storage._call( + 'storage_ref_with_lookup', + rid, + timeout, + {bids[1], bids[2]} + ) + ilt.assert_equals(err, nil) + ilt.assert_equals(res, {rid = rid, moved = {}}) + _, err = ivshard.storage._call('storage_unref', rid) + ilt.assert_equals(err, nil) + + -- Check for double referencing. + res, err = ivshard.storage._call( + 'storage_ref_with_lookup', + rid, + timeout, + {bids[1], bids[1]} + ) + ilt.assert_equals(err, nil) + ilt.assert_equals(res, {rid = rid, moved = {}}) + _, err = ivshard.storage._call('storage_unref', rid) + ilt.assert_equals(err, nil) + _, err = ivshard.storage._call('storage_unref', rid) + t.assert_str_contains(err.message, + 'Can not delete a storage ref: no ref') + + -- Check for an absent bucket. + res, err = ivshard.storage._call( + 'storage_ref_with_lookup', + rid, + timeout, + {bids[1], bids[2], bid_extra} + ) + ilt.assert_equals(err, nil) + ilt.assert_equals(res, {rid = rid, moved = {bid_extra}}) + ivshard.storage._call('storage_unref', rid) + + -- Check that we do not create a reference if there are no buckets. 
+ res, err = ivshard.storage._call( + 'storage_ref_with_lookup', + rid, + timeout, + {bid_extra} + ) + ilt.assert_equals(err, nil) + ilt.assert_equals(res, {rid = nil, moved = {bid_extra}}) + res, err = ivshard.storage._call('storage_unref', rid) + t.assert_str_contains(err.message, + 'Can not delete a storage ref: no ref') + ilt.assert_equals(res, nil) + + -- Check for a timeout. + -- Emulate a case when all buckets are not writable. + local func = ivshard.storage.internal.bucket_are_all_rw + ivshard.storage.internal.bucket_are_all_rw = function() return false end + res, err = ivshard.storage._call( + 'storage_ref_with_lookup', + rid, + timeout, + {bids[1]} + ) + ivshard.storage.internal.bucket_are_all_rw = func + t.assert_str_contains(err.message, 'Timeout exceeded') + ilt.assert_equals(res, nil) + -- Check that the reference was not created. + res, err = ivshard.storage._call('storage_unref', rid) + t.assert_str_contains(err.message, + 'Can not delete a storage ref: no ref') + ilt.assert_equals(res, nil) + end) +end + +test_group.test_absent_buckets = function(g) + g.replica_1_a:exec(function() + local res, err = ivshard.storage._call( + 'storage_absent_buckets', + {_G.get_first_bucket()} + ) + ilt.assert_equals(err, nil) + ilt.assert_equals(res, {}) + end) + + g.replica_1_a:exec(function() + local bid_extra = 3001 + local res, err = ivshard.storage._call( + 'storage_absent_buckets', + {_G.get_first_bucket(), bid_extra} + ) + ilt.assert_equals(err, nil) + ilt.assert_equals(res, {bid_extra}) + end) +end diff --git a/test/upgrade/upgrade.result b/test/upgrade/upgrade.result index 5f4f0582..c5b99ba0 100644 --- a/test/upgrade/upgrade.result +++ b/test/upgrade/upgrade.result @@ -177,8 +177,10 @@ vshard.storage._call('test_api', 1, 2, 3) | - rebalancer_apply_routes | - rebalancer_request_state | - recovery_bucket_stat + | - storage_absent_buckets | - storage_map | - storage_ref + | - storage_ref_with_lookup | - storage_unref | - test_api | - 1 diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua index f6809a0d..2fbfba27 100644 --- a/vshard/storage/init.lua +++ b/vshard/storage/init.lua @@ -3180,6 +3180,70 @@ local function storage_ref(rid, timeout) return bucket_count() end +-- +-- Lookup for absent active buckets. +-- +-- @param bucket_ids List of bucket identifiers. +-- @return A dictionary with a list of bucket identifiers which are +-- not present on the current instance. +-- +local function storage_absent_buckets(bucket_ids) + local status = consts.BUCKET + local moved_buckets = {} + for _, bucket_id in pairs(bucket_ids) do + local bucket = box.space._bucket:get{bucket_id} + if not bucket or bucket.status ~= status.ACTIVE then + table.insert(moved_buckets, bucket_id) + end + end + return { moved = moved_buckets } +end + +-- +-- Bind a new storage ref to the current box session and check +-- that all the buckets are present. Is used as a part of the +-- partial Map-Reduce API. +-- +-- @param rid Unique reference ID. +-- @param timeout Seconds. +-- @param bucket_ids List of bucket identifiers. +-- @return A dictionary with reference id and a list of bucket identifiers +-- which are not present on the current instance. If all the buckets +-- are absent, the reference is not created and a nil reference id +-- with the list of absent buckets is returned. 
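+--
+-- A minimal usage sketch (the rid, timeout and bucket ids below are
+-- arbitrary examples):
+--
+--     local res, err = vshard.storage._call('storage_ref_with_lookup',
+--                                           rid, timeout, {1, 2, 3})
+--     -- res.moved - absent buckets, res.rid - the created ref or nil.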
+-- +local function storage_ref_with_lookup(rid, timeout, bucket_ids) + local moved = storage_absent_buckets(bucket_ids).moved + if #moved == #bucket_ids then + -- Take an advantage that moved buckets are returned in the same + -- order as in the input list. + local do_match = true + local next_moved = next(moved) + local next_passed = next(bucket_ids) + ::continue:: + if next_moved then + if next_moved == next_passed then + next_moved = next(moved, next_moved) + next_passed = next(bucket_ids, next_passed) + goto continue + else + do_match = false + end + end + if do_match then + -- If all the passed buckets are absent, there is no need + -- to create a ref. + return {rid = nil, moved = moved} + end + end + + local ok, err = storage_ref(rid, timeout) + if not ok then + return nil, err + end + return {rid = rid, moved = moved} +end + -- -- Drop a storage ref from the current box session. Is used as a part of -- Map-Reduce API. @@ -3234,7 +3298,9 @@ service_call_api = setmetatable({ rebalancer_apply_routes = rebalancer_apply_routes, rebalancer_request_state = rebalancer_request_state, recovery_bucket_stat = recovery_bucket_stat, + storage_absent_buckets = storage_absent_buckets, storage_ref = storage_ref, + storage_ref_with_lookup = storage_ref_with_lookup, storage_unref = storage_unref, storage_map = storage_map, info = storage_service_info, From 87aefc7a39814395e79716d9f9880db82f8cc533 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Wed, 18 Sep 2024 19:43:52 +0200 Subject: [PATCH 06/17] Make the tests pass from prev commit The previous commit was failing some tests. Lets patch them up. That commit isn't amended so as to keep its original shape in respect to the external contributor. NO_DOC=bugfix --- test/instances/storage.lua | 3 +-- test/storage-luatest/storage_1_test.lua | 7 ++++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/test/instances/storage.lua b/test/instances/storage.lua index 95cd4a7a..964da042 100755 --- a/test/instances/storage.lua +++ b/test/instances/storage.lua @@ -13,7 +13,6 @@ _G.ivconst = require('vshard.consts') _G.ivutil = require('vshard.util') _G.iverror = require('vshard.error') _G.ivtest = require('test.luatest_helpers.vtest') -_G.itable_new = require('table.new') _G.iwait_timeout = _G.ivtest.wait_timeout @@ -75,7 +74,7 @@ local function get_n_buckets(n) error('Invalid number of buckets') end local index = box.space._bucket.index.status - local ids = _G.itable_new(0, n) + local ids = table.new(0, n) for _, tuple in index:pairs(vconst.BUCKET.ACTIVE) do table.insert(ids, tuple.id) if #ids == n then diff --git a/test/storage-luatest/storage_1_test.lua b/test/storage-luatest/storage_1_test.lua index 2e27f811..9e760ebe 100644 --- a/test/storage-luatest/storage_1_test.lua +++ b/test/storage-luatest/storage_1_test.lua @@ -282,7 +282,8 @@ test_group.test_ref_with_lookup = function(g) ) ilt.assert_equals(err, nil) ilt.assert_equals(res, {rid = rid, moved = {bid_extra}}) - ivshard.storage._call('storage_unref', rid) + _, err = ivshard.storage._call('storage_unref', rid) + ilt.assert_equals(err, nil) -- Check that we do not create a reference if there are no buckets. 
res, err = ivshard.storage._call( @@ -326,7 +327,7 @@ test_group.test_absent_buckets = function(g) {_G.get_first_bucket()} ) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {}) + ilt.assert_equals(res, {moved = {}}) end) g.replica_1_a:exec(function() @@ -336,6 +337,6 @@ test_group.test_absent_buckets = function(g) {_G.get_first_bucket(), bid_extra} ) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {bid_extra}) + ilt.assert_equals(res, {moved = {bid_extra}}) end) end From 5ab5b4c7eb6a577e758ece7524981a6d289466a6 Mon Sep 17 00:00:00 2001 From: Denis Smirnov Date: Fri, 24 Nov 2023 12:15:36 +0700 Subject: [PATCH 07/17] router: implement partial map-reduce API Introduce a partial ref-map-reduce API for vshard. It guarantees that in case of success the function is executed exactly once on the storages, that contain the given list of buckets. NO_DOC=later --- test/luatest_helpers/vtest.lua | 10 + test/router-luatest/map_part_test.lua | 300 ++++++++++++++++++++++++++ test/router-luatest/router_test.lua | 19 ++ vshard/router/init.lua | 235 ++++++++++++++++++++ 4 files changed, 564 insertions(+) create mode 100644 test/router-luatest/map_part_test.lua diff --git a/test/luatest_helpers/vtest.lua b/test/luatest_helpers/vtest.lua index 1a751532..5350140d 100644 --- a/test/luatest_helpers/vtest.lua +++ b/test/luatest_helpers/vtest.lua @@ -466,6 +466,15 @@ local function storage_first_bucket(storage) end) end +-- +-- Get n active buckets from the storage. +-- +local function storage_get_n_buckets(storage, n) + return storage:exec(function(n) + return _G.get_n_buckets(n) + end, {n}) +end + -- -- Disable rebalancer on all storages. -- @@ -854,6 +863,7 @@ return { cluster_wait_fullsync = cluster_wait_fullsync, cluster_rebalancer_find = cluster_rebalancer_find, storage_first_bucket = storage_first_bucket, + storage_get_n_buckets = storage_get_n_buckets, storage_stop = storage_stop, storage_start = storage_start, router_new = router_new, diff --git a/test/router-luatest/map_part_test.lua b/test/router-luatest/map_part_test.lua new file mode 100644 index 00000000..8a9dc3d3 --- /dev/null +++ b/test/router-luatest/map_part_test.lua @@ -0,0 +1,300 @@ + +local t = require('luatest') +local vtest = require('test.luatest_helpers.vtest') + +local g = t.group('router') +local cfg_template = { + sharding = { + { + replicas = { + replica_1_a = { + master = true, + }, + replica_1_b = {}, + }, + }, + { + replicas = { + replica_2_a = { + master = true, + }, + replica_2_b = {}, + }, + }, + }, + bucket_count = 100, + test_user_grant_range = 'super', +} +local global_cfg = vtest.config_new(cfg_template) + +g.before_all(function() + vtest.cluster_new(g, global_cfg) + + t.assert_equals(g.replica_1_a:exec(function() + return #ivshard.storage.info().alerts + end), 0, 'no alerts after boot') + + local router = vtest.router_new(g, 'router', global_cfg) + g.router = router + local res, err = router:exec(function() + return ivshard.router.bootstrap({timeout = iwait_timeout}) + end) + t.assert(res and not err, 'bootstrap buckets') +end) + +g.after_all(function() + g.cluster:drop() +end) + +local function map_part_init() + local rs1_uuid = g.replica_1_a:replicaset_uuid() + local rs2_uuid = g.replica_2_a:replicaset_uuid() + + local create_map_func_f = function(res1) + rawset(_G, 'do_map', function(res2) + return {res1, res2} + end) + end + g.replica_1_a:exec(create_map_func_f, {1}) + g.replica_2_a:exec(create_map_func_f, {2}) + + local bids1 = vtest.storage_get_n_buckets(g.replica_1_a, 4) + local bids2 = 
vtest.storage_get_n_buckets(g.replica_2_a, 1) + + return { + rs1_uuid = rs1_uuid, + rs2_uuid = rs2_uuid, + bids1 = bids1, + bids2 = bids2, + } +end + +g.test_map_part_single_rs = function(g) + local expected, res + local init = map_part_init() + + res = g.router:exec(function(bid1, bid2) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid1, bid2}, + 'do_map', + {3}, + {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[3], init.bids1[2]}) + t.assert_equals(res.err, nil) + t.assert_equals(res.err_uuid, nil) + expected = { + [init.rs1_uuid] = {{1, 3}}, + } + t.assert_equals(res.val, expected) +end + +g.test_map_part_multi_rs = function(g) + local expected, res + local init = map_part_init() + + res = g.router:exec(function(bid1, bid2) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[1], init.bids2[1]}) + t.assert_equals(res.err, nil) + t.assert_equals(res.err_uuid, nil) + expected = { + [init.rs1_uuid] = {{1, 42}}, + [init.rs2_uuid] = {{2, 42}}, + } + t.assert_equals(res.val, expected) +end + +g.test_map_part_ref = function(g) + local expected, res + local init = map_part_init() + + -- First move some buckets from rs1 to rs2 and then pause gc on rs1. + -- As a result, the buckets will be in the SENT state on rs1 and + -- in the ACTIVE state on rs2. + g.replica_1_a:exec(function(bid1, bid2, to) + ivshard.storage.internal.errinj.ERRINJ_BUCKET_GC_PAUSE = true + ivshard.storage.bucket_send(bid1, to) + ivshard.storage.bucket_send(bid2, to) + end, {init.bids1[1], init.bids1[2], init.rs2_uuid}) + -- The buckets are ACTIVE on rs2, so the partial map should succeed. + res = g.router:exec(function(bid1, bid2) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[1], init.bids1[2]}) + t.assert_equals(res.err, nil) + t.assert_equals(res.err_uuid, nil) + expected = { + [init.rs2_uuid] = {{2, 42}}, + } + t.assert_equals(res.val, expected) + -- But if we use some active bucket from rs1, the partial map should fail. + -- The reason is that the moved buckets are still in the SENT state and + -- we can't take a ref. + res = g.router:exec(function(bid1) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid1}, 'do_map', {42}, {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[3]}) + t.assert_equals(res.val, nil) + t.assert(res.err) + t.assert_equals(res.err_uuid, init.rs1_uuid) + -- The moved buckets still exist on the rs1 with non-active status. + -- Let's remove them and re-enable gc on rs1. + g.replica_1_a:exec(function() + ivshard.storage.internal.errinj.ERRINJ_BUCKET_GC_PAUSE = false + _G.bucket_gc_wait() + end) + -- Now move the buckets back to rs1 and pause gc on rs2. + -- The buckets will be ACTIVE on rs1 and SENT on rs2, + -- so the partial map should succeed. 
+ res = g.replica_2_a:exec(function(bid1, bid2, to) + ivshard.storage.internal.errinj.ERRINJ_BUCKET_GC_PAUSE = true + ivshard.storage.bucket_send(bid1, to) + local res, err = ivshard.storage.bucket_send(bid2, to) + return { + res = res, + err = err, + } + end, {init.bids1[1], init.bids1[2], init.rs1_uuid}) + t.assert_equals(res.err, nil) + + res = g.router:exec(function(bid1, bid2) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[1], init.bids1[2]}) + t.assert_equals(res.err, nil) + t.assert_equals(res.err_uuid, nil) + expected = { + [init.rs1_uuid] = {{1, 42}}, + } + t.assert_equals(res.val, expected) + -- Re-enable gc on rs2. + g.replica_2_a:exec(function() + ivshard.storage.internal.errinj.ERRINJ_BUCKET_GC_PAUSE = false + _G.bucket_gc_wait() + end) +end + +g.test_map_part_double_ref = function(g) + local expected, res + local init = map_part_init() + + -- First, disable discovery on the router to disable route cache update. + g.router:exec(function() + ivshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY = true + end) + -- Then, move the bucket form rs1 to rs2. Now the router has an outdated + -- route cache. + g.replica_1_a:exec(function(bid, to) + ivshard.storage.bucket_send(bid, to) + end, {init.bids1[4], init.rs2_uuid}) + -- Call a partial map for the moved bucket and some bucket from rs2. The ref + -- stage should be done in two steps: + -- 1. ref rs2 and returns the moved bucket; + -- 2. discover the moved bucket on rs2 and avoid double reference; + res = g.router:exec(function(bid1, bid2) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[4], init.bids2[1]}) + t.assert_equals(res.err, nil) + t.assert_equals(res.err_uuid, nil) + expected = { + [init.rs2_uuid] = {{2, 42}}, + } + t.assert_equals(res.val, expected) + -- Call a partial map one more time to make sure there are no references + -- left. + res = g.router:exec(function(bid1, bid2) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[4], init.bids2[1]}) + t.assert_equals(res.err, nil) + t.assert_equals(res.err_uuid, nil) + t.assert_equals(res.val, expected) + -- Return the bucket back and re-enable discovery on the router. + g.replica_2_a:exec(function(bid, to) + ivshard.storage.bucket_send(bid, to) + end, {init.bids1[4], init.rs1_uuid}) + g.router:exec(function() + ivshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY = false + end) +end + +g.test_map_part_map = function(g) + local res + local init = map_part_init() + + g.replica_2_a:exec(function() + _G.do_map = function() + return box.error(box.error.PROC_LUA, "map_err") + end + end) + res = g.router:exec(function(bid1, bid2) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid2, bid1}, 'do_map', {3}, {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[1], init.bids2[1]}) + t.assert_equals(res.val, nil) + t.assert_covers(res.err, { + code = box.error.PROC_LUA, + type = 'ClientError', + message = 'map_err' + }) + t.assert_equals(res.err_uuid, init.rs2_uuid) + -- Check that there is no dangling references after the error. 
+ init = map_part_init() + res = g.router:exec(function(bid1, bid2) + local val, err, err_uuid = ivshard.router.map_part_callrw( + {bid1, bid2}, 'do_map', {3}, {timeout = iwait_timeout}) + return { + val = val, + err = err, + err_uuid = err_uuid, + } + end, {init.bids1[1], init.bids2[1]}) + t.assert_equals(res.err, nil) + t.assert_equals(res.err_uuid, nil) + t.assert_equals(res.val, { + [init.rs1_uuid] = {{1, 3}}, + [init.rs2_uuid] = {{2, 3}}, + }) +end diff --git a/test/router-luatest/router_test.lua b/test/router-luatest/router_test.lua index b2e12951..0938b600 100644 --- a/test/router-luatest/router_test.lua +++ b/test/router-luatest/router_test.lua @@ -286,6 +286,25 @@ g.test_map_callrw_raw = function(g) end) end +g.test_group_buckets = function(g) + local bids = vtest.storage_get_n_buckets(g.replica_1_a, 2) + + local res = g.router:exec(function(bid1, bid2) + local val, err = ivshard.router.group({bid2, bid1, bid1}) + return { + val = val, + err = err, + } + end, {bids[1], bids[2]}) + assert(res.err == nil) + local rs1_uuid = g.replica_1_a:replicaset_uuid() + local expected = { + [rs1_uuid] = {bids[1], bids[2]}, + } + table.sort(expected[rs1_uuid]) + t.assert_equals(res.val, expected) +end + g.test_uri_compare_and_reuse = function(g) -- Multilisten itself is not used, but URI-table is supported since the same -- version. diff --git a/vshard/router/init.lua b/vshard/router/init.lua index f16fb69e..97169d38 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -223,6 +223,49 @@ local function bucket_resolve(router, bucket_id) return replicaset end +-- Group bucket ids by replicasets according to the router cache. +local function buckets_group(router, bucket_ids, timeout) + timeout = timeout or consts.CALL_TIMEOUT_MIN + local deadline = fiber_clock() + timeout + local replicaset, err + local replicaset_buckets = {} + local prev_id = nil + local buckets, id + + -- Sort buckets to skip duplicates. + table.sort(bucket_ids) + for _, bucket_id in pairs(bucket_ids) do + if bucket_id == prev_id then + goto continue + end + + if fiber_clock() > deadline then + return nil, lerror.timeout() + end + + replicaset, err = bucket_resolve(router, bucket_id) + if err ~= nil then + return nil, err + end + if replicaset == nil then + return nil, lerror.vshard(lerror.code.NO_ROUTE_TO_BUCKET, bucket_id) + end + + id = replicaset.id + buckets = replicaset_buckets[id] + if buckets then + table.insert(buckets, bucket_id) + else + replicaset_buckets[id] = {bucket_id} + end + + ::continue:: + prev_id = bucket_id + end + + return replicaset_buckets +end + -- -- Arrange downloaded buckets to the route map so as they -- reference a given replicaset. @@ -972,6 +1015,197 @@ local function router_map_callrw(router, func, args, opts) return nil, err, err_id end +-- +-- Partial Ref-Map-Reduce. The given function is called on the masters +-- with a guarantee that in case of success it was executed exactly once +-- on all storages that contain the given buckets. Execution on masters +-- is chosen to allow buckets be accessed for reads and writes during the +-- call. +-- +-- The execution consists of the following stages: +-- +-- Ref stage. We ref all the storages with the given buckets. It is +-- an iterative process, because the router's cache may be outdated, +-- and we need to discover the moved buckets. At the end of this stage +-- we have refed all the storages containing the given buckets with the +-- given timeout. +-- +-- Map stage. We call the given function on all the storages we have +-- refed. 
On the storage side we use a `storage_map` function, which +-- switches the timeout ref to the manual mode and unrefs the storage +-- after the function is called. +-- +-- Reduce stage. We collect the results of the function call. +-- +-- @param router Instance to use. +-- @param bucket_ids List of bucket identifiers. +-- @param func Name of the function to call. +-- @param args Function arguments passed in netbox style (as an array). +-- @param opts Can only contain 'timeout' as a number of seconds. Note that the +-- refs may end up being kept on the storages during this entire timeout if +-- something goes wrong. For instance, network issues appear. This means +-- better not use a value bigger than necessary. A stuck infinite ref can +-- only be dropped by this router restart/reconnect or the storage restart. +-- +-- @return In case of success - a map with replicaset UUID keys and values being +-- what the function returned from the replicaset. +-- +-- @return In case of an error - nil, error object, optional UUID of the +-- replicaset where the error happened. UUID may be not present if it wasn't +-- about concrete replicaset. For example, not all buckets were found even +-- though all replicasets were scanned. +-- +local function router_map_part_callrw(router, bucket_ids, func, args, opts) + local replicaset + local grouped_buckets + local err, err_id, res, ok, map + local call_args + local replicasets = {} + local preallocated = false + local futures = {} + local call_opts = {is_async = true} + local rs_count = 0 + local timeout = opts and opts.timeout or consts.CALL_TIMEOUT_MIN + local deadline = fiber_clock() + timeout + local rid = M.ref_id + M.ref_id = rid + 1 + + -- Nil checks are done explicitly here (== nil instead of 'not'), because + -- netbox requests return box.NULL instead of nils. + + -- Ref stage. + while #bucket_ids > 0 do + -- Group the buckets by replicasets according to the router cache. + grouped_buckets, err = buckets_group(router, bucket_ids, timeout) + if grouped_buckets == nil then + goto fail + end + timeout = deadline - fiber_clock() + + -- Send ref requests with timeouts to the replicasets. + futures = table_new(0, #grouped_buckets) + for id, buckets in pairs(grouped_buckets) do + replicaset = router.replicasets[id] + -- Netbox async requests work only with active connections. Need to + -- wait for the connection explicitly. + timeout, err = replicaset:wait_connected(timeout) + if timeout == nil then + err_id = id + goto fail + end + if replicasets[id] then + -- Replicaset is already referred on the previous iteration. + -- Simply get the moved buckets without double referencing. + call_args = {'storage_absent_buckets', buckets} + else + call_args = {'storage_ref_with_lookup', rid, timeout, buckets} + rs_count = rs_count + 1 + end + res, err = replicaset:callrw('vshard.storage._call', call_args, + call_opts) + if res == nil then + err_id = id + goto fail + end + futures[id] = res + end + + if not preallocated then + -- Current preallocation works only for the first iteration + -- (i.e. router cache is not outdated). + replicasets = table_new(0, rs_count) + preallocated = true + end + + -- Wait for the refs to be done and collect moved buckets. + bucket_ids = {} + for id, future in pairs(futures) do + res, err = future_wait(future, timeout) + -- Handle netbox error first. + if res == nil then + err_id = id + goto fail + end + -- Ref returns nil,err or {rid, moved}. 
+ res, err = res[1], res[2] + if res == nil then + err_id = id + goto fail + end + for _, bucket_id in pairs(res.moved) do + bucket_reset(router, bucket_id) + table.insert(bucket_ids, bucket_id) + end + if res.rid then + -- If there are no buckets on the replicaset, + -- it would not be referred. + replicasets[id] = router.replicasets[id] + end + timeout = deadline - fiber_clock() + end + end + + -- Map stage. + map = table_new(0, rs_count) + futures = table_new(0, rs_count) + args = {'storage_map', rid, func, args} + -- Send map requests. + for id, rs in pairs(replicasets) do + res, err = rs:callrw('vshard.storage._call', args, call_opts) + if res == nil then + err_id = id + goto fail + end + futures[id] = res + end + + -- Reduce stage. + -- Collect map responses (refs were deleted by the storages for non-error + -- results). + for id, future in pairs(futures) do + res, err = future_wait(future, timeout) + if res == nil then + err_id = id + goto fail + end + -- Map returns true,res or nil,err. + ok, res = res[1], res[2] + if ok == nil then + err = res + err_id = id + goto fail + end + if res ~= nil then + -- Store as a table so in future it could be extended for + -- multireturn. + map[id] = {res} + end + timeout = deadline - fiber_clock() + end + do return map end + +::fail:: + local f + for id, future in pairs(futures) do + future:discard() + -- Best effort to remove the created refs before exiting. Can help if + -- the timeout was big and the error happened early. + call_args = {'storage_unref', rid} + replicaset = router.replicasets[id] + if replicaset then + f = replicaset:callrw('vshard.storage._call', call_args, call_opts) + if f ~= nil then + -- Don't care waiting for a result - no time for this. But it + -- won't affect the request sending if the connection is still + -- alive. + f:discard() + end + end + end + err = lerror.make(err) + return nil, err, err_id +end + -- -- Get replicaset object by bucket identifier. -- @param bucket_id Bucket identifier. @@ -1817,6 +2051,7 @@ local router_mt = { callre = router_make_api(router_callre), callbre = router_make_api(router_callbre), map_callrw = router_make_api(router_map_callrw), + map_part_callrw = router_make_api(router_map_part_callrw), route = router_make_api(router_route), routeall = router_make_api(router_routeall), bucket_id = router_make_api(router_bucket_id), From 8c9529395cf9005328f65a75f37a2955450e4749 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Wed, 18 Sep 2024 19:57:02 +0200 Subject: [PATCH 08/17] Make the tests pass from prev commit The previous commit was failing some tests. Lets patch them up. That commit isn't amended so as to keep its original shape in respect to the external contributor. 
NO_DOC=bugfix --- test/router-luatest/router_test.lua | 2 +- vshard/router/init.lua | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/test/router-luatest/router_test.lua b/test/router-luatest/router_test.lua index 0938b600..f9cd6920 100644 --- a/test/router-luatest/router_test.lua +++ b/test/router-luatest/router_test.lua @@ -290,7 +290,7 @@ g.test_group_buckets = function(g) local bids = vtest.storage_get_n_buckets(g.replica_1_a, 2) local res = g.router:exec(function(bid1, bid2) - local val, err = ivshard.router.group({bid2, bid1, bid1}) + local val, err = ivshard.router._buckets_group({bid2, bid1, bid1}) return { val = val, err = err, diff --git a/vshard/router/init.lua b/vshard/router/init.lua index 97169d38..ea4c5fd2 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -2066,6 +2066,7 @@ local router_mt = { discovery_set = router_make_api(discovery_set), _route_map_clear = router_make_api(route_map_clear), _bucket_reset = router_make_api(bucket_reset), + _buckets_group = router_make_api(buckets_group), disable = router_disable, enable = router_enable, } From ff8e8a0f1f58033445e0d2ae41b02435d08b6a2f Mon Sep 17 00:00:00 2001 From: Denis Smirnov Date: Thu, 30 Nov 2023 12:17:00 +0700 Subject: [PATCH 09/17] router: improve master connection parallelism in map-reduce This is useful for RW map-reduce requests which need to send multiple network requests in parallel to multiple masters. In-parallel means using is_async netbox feature. But it only works if the connection is already established. Which means that the connection establishment ideally must also be parallel. NO_DOC=internal NO_TEST=already covered --- vshard/replicaset.lua | 1 + vshard/router/init.lua | 61 +++++++++++++++++++++++++++++++++++++----- 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua index abdffe67..086ed4ec 100644 --- a/vshard/replicaset.lua +++ b/vshard/replicaset.lua @@ -1225,6 +1225,7 @@ local replicaset_mt = { up_replica_priority = replicaset_up_replica_priority; wait_connected = replicaset_wait_connected, wait_connected_all = replicaset_wait_connected_all, + wait_master = replicaset_wait_master, call = replicaset_master_call; callrw = replicaset_master_call; callro = replicaset_template_multicallro(false, false); diff --git a/vshard/router/init.lua b/vshard/router/init.lua index ea4c5fd2..48bd9f4e 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -804,6 +804,37 @@ local function router_call(router, bucket_id, opts, ...) ...) end +local function wait_connected_to_masters(replicasets, timeout) + local master, id + local err, err_id + + -- Start connecting to all masters in parallel. + local deadline = fiber_clock() + timeout + for _, replicaset in pairs(replicasets) do + master, err = replicaset:wait_master(timeout) + id = replicaset.id + if not master then + err_id = id + goto fail + end + replicaset:connect_replica(master) + timeout = deadline - fiber_clock() + end + + -- Wait until all connections are established. + for _, replicaset in pairs(replicasets) do + timeout, err = replicaset:wait_connected(timeout) + if not timeout then + err_id = replicaset.id + goto fail + end + end + do return timeout end + + ::fail:: + return nil, err, err_id +end + -- -- Perform map-reduce stages on the given set of replicasets. The map expects a -- valid ref ID which must be done beforehand. @@ -956,14 +987,17 @@ local function router_map_callrw(router, func, args, opts) -- -- Ref stage: send. 
-- + -- Netbox async requests work only with active connections. Need to wait + -- for the connection explicitly. + local rs_list = table_new(0, #replicasets) + for _, rs in pairs(replicasets) do + table.insert(rs_list, rs) + end + timeout, err, err_id = wait_connected_to_masters(rs_list, timeout) + if not timeout then + goto fail + end for id, rs in pairs(replicasets) do - -- Netbox async requests work only with active connections. Need to wait - -- for the connection explicitly. - timeout, err = rs:wait_connected(timeout) - if timeout == nil then - err_id = id - goto fail - end res, err = rs:callrw('vshard.storage._call', {'storage_ref', rid, timeout}, opts_async) if res == nil then @@ -1060,6 +1094,7 @@ local function router_map_part_callrw(router, bucket_ids, func, args, opts) local grouped_buckets local err, err_id, res, ok, map local call_args + local rs_list local replicasets = {} local preallocated = false local futures = {} @@ -1082,6 +1117,18 @@ local function router_map_part_callrw(router, bucket_ids, func, args, opts) end timeout = deadline - fiber_clock() + -- Netbox async requests work only with active connections. + -- So, first need to wait for the master connection explicitly. + rs_list = table_new(0, #grouped_buckets) + for uuid, _ in pairs(grouped_buckets) do + replicaset = router.replicasets[uuid] + table.insert(rs_list, replicaset) + end + timeout, err, err_id = wait_connected_to_masters(rs_list, timeout) + if not timeout then + goto fail + end + -- Send ref requests with timeouts to the replicasets. futures = table_new(0, #grouped_buckets) for id, buckets in pairs(grouped_buckets) do From 7ff139a0c9685bd0de9f483c6b6664900531ae45 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Wed, 18 Sep 2024 20:05:42 +0200 Subject: [PATCH 10/17] Review fixes for Partial Map-Reduce There were a number of minor issues with the previous several commits, like the tests running way too long or some cases not being covered or the code being non-critically suboptimal. Lets fix them all. The original commits aren't amended so as to keep their original shape in respect to the external contributor. 
NO_DOC=bugfix --- test/router-luatest/map_part_test.lua | 489 ++++++++++++++---------- test/router-luatest/router_test.lua | 17 +- test/router/router.result | 1 + test/storage-luatest/storage_1_test.lua | 115 +++--- test/upgrade/upgrade.result | 4 +- vshard/replicaset.lua | 30 +- vshard/router/init.lua | 193 +++------- vshard/storage/init.lua | 34 +- 8 files changed, 429 insertions(+), 454 deletions(-) diff --git a/test/router-luatest/map_part_test.lua b/test/router-luatest/map_part_test.lua index 8a9dc3d3..1a606b2c 100644 --- a/test/router-luatest/map_part_test.lua +++ b/test/router-luatest/map_part_test.lua @@ -1,6 +1,7 @@ - +local fiber = require('fiber') local t = require('luatest') local vtest = require('test.luatest_helpers.vtest') +local vutil = require('vshard.util') local g = t.group('router') local cfg_template = { @@ -22,279 +23,369 @@ local cfg_template = { }, }, }, - bucket_count = 100, + bucket_count = 10, test_user_grant_range = 'super', } local global_cfg = vtest.config_new(cfg_template) -g.before_all(function() - vtest.cluster_new(g, global_cfg) - - t.assert_equals(g.replica_1_a:exec(function() +g.before_all(function(cg) + vtest.cluster_new(cg, global_cfg) + t.assert_equals(cg.replica_1_a:exec(function() return #ivshard.storage.info().alerts end), 0, 'no alerts after boot') - - local router = vtest.router_new(g, 'router', global_cfg) - g.router = router + local _ + local router = vtest.router_new(cg, 'router', global_cfg) + cg.router = router local res, err = router:exec(function() - return ivshard.router.bootstrap({timeout = iwait_timeout}) + local res, err = ivshard.router.bootstrap({timeout = iwait_timeout}) + rawset(_G, 'do_map', function(bids, args, opts) + local old_bids = table.copy(bids) + local val, err, err_uuid = ivshard.router.map_part_callrw( + bids, 'do_map', args, opts) + -- Make sure the bucket list isn't changed by vshard. + ilt.assert_equals(bids, old_bids) + local val_type + if opts.return_raw and val ~= nil then + -- Src+value. The src is plain Lua data. The value is raw. 
+ local _, one_map = next(val) + val_type = type(one_map) + else + val_type = type(val) + end + return { + val = val, + val_type = val_type, + err = err, + err_uuid = err_uuid, + } + end) + return res, err end) t.assert(res and not err, 'bootstrap buckets') + _, err = vtest.cluster_exec_each(cg, function() + rawset(_G, 'do_map', function(res) + ilt.assert_gt(require('vshard.storage.ref').count, 0) + return {ivutil.replicaset_uuid(), res} + end) + rawset(_G, 'bucket_send', function(bid, dst) + local _, err = ivshard.storage.bucket_send( + bid, dst, {timeout = iwait_timeout}) + ilt.assert_equals(err, nil) + end) + end) + t.assert_equals(err, nil) + cg.rs1_uuid = cg.replica_1_a:replicaset_uuid() + cg.rs2_uuid = cg.replica_2_a:replicaset_uuid() end) -g.after_all(function() - g.cluster:drop() +g.after_all(function(cg) + cg.cluster:drop() end) -local function map_part_init() - local rs1_uuid = g.replica_1_a:replicaset_uuid() - local rs2_uuid = g.replica_2_a:replicaset_uuid() - - local create_map_func_f = function(res1) - rawset(_G, 'do_map', function(res2) - return {res1, res2} - end) - end - g.replica_1_a:exec(create_map_func_f, {1}) - g.replica_2_a:exec(create_map_func_f, {2}) - - local bids1 = vtest.storage_get_n_buckets(g.replica_1_a, 4) - local bids2 = vtest.storage_get_n_buckets(g.replica_2_a, 1) - - return { - rs1_uuid = rs1_uuid, - rs2_uuid = rs2_uuid, - bids1 = bids1, - bids2 = bids2, - } +local function router_do_map(router, bids, args, opts) + return router:exec(function(bids, args, opts) + return _G.do_map(bids, args, opts) + end, {bids, args, opts}) end -g.test_map_part_single_rs = function(g) - local expected, res - local init = map_part_init() - - res = g.router:exec(function(bid1, bid2) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid1, bid2}, - 'do_map', - {3}, - {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[3], init.bids1[2]}) +g.test_map_part_single_rs = function(cg) + local bids = vtest.storage_get_n_buckets(cg.replica_1_a, 4) + local res = router_do_map(cg.router, {bids[3], bids[2]}, + {123}, {timeout = vtest.wait_timeout}) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) - expected = { - [init.rs1_uuid] = {{1, 3}}, - } - t.assert_equals(res.val, expected) + t.assert_equals(res.val, { + [cg.rs1_uuid] = {{cg.rs1_uuid, 123}}, + }) end -g.test_map_part_multi_rs = function(g) - local expected, res - local init = map_part_init() - - res = g.router:exec(function(bid1, bid2) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[1], init.bids2[1]}) +g.test_map_part_multi_rs = function(cg) + local bid1 = vtest.storage_first_bucket(cg.replica_1_a) + local bid2 = vtest.storage_first_bucket(cg.replica_2_a) + local res = router_do_map(cg.router, {bid1, bid2}, + {123}, {timeout = vtest.wait_timeout}) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) - expected = { - [init.rs1_uuid] = {{1, 42}}, - [init.rs2_uuid] = {{2, 42}}, - } - t.assert_equals(res.val, expected) + t.assert_equals(res.val, { + [cg.rs1_uuid] = {{cg.rs1_uuid, 123}}, + [cg.rs2_uuid] = {{cg.rs2_uuid, 123}}, + }) end -g.test_map_part_ref = function(g) - local expected, res - local init = map_part_init() - +g.test_map_part_ref = function(cg) -- First move some buckets from rs1 to rs2 and then pause gc on rs1. 
-- As a result, the buckets will be in the SENT state on rs1 and -- in the ACTIVE state on rs2. - g.replica_1_a:exec(function(bid1, bid2, to) - ivshard.storage.internal.errinj.ERRINJ_BUCKET_GC_PAUSE = true - ivshard.storage.bucket_send(bid1, to) - ivshard.storage.bucket_send(bid2, to) - end, {init.bids1[1], init.bids1[2], init.rs2_uuid}) + local bids1 = vtest.storage_get_n_buckets(cg.replica_1_a, 3) + cg.replica_1_a:exec(function(bid1, bid2, to) + _G.bucket_gc_pause() + _G.bucket_send(bid1, to) + _G.bucket_send(bid2, to) + end, {bids1[1], bids1[2], cg.rs2_uuid}) -- The buckets are ACTIVE on rs2, so the partial map should succeed. - res = g.router:exec(function(bid1, bid2) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[1], init.bids1[2]}) + local res = router_do_map(cg.router, {bids1[1], bids1[2]}, + {42}, {timeout = vtest.wait_timeout}) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) - expected = { - [init.rs2_uuid] = {{2, 42}}, - } - t.assert_equals(res.val, expected) + t.assert_equals(res.val, { + [cg.rs2_uuid] = {{cg.rs2_uuid, 42}}, + }) -- But if we use some active bucket from rs1, the partial map should fail. -- The reason is that the moved buckets are still in the SENT state and -- we can't take a ref. - res = g.router:exec(function(bid1) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid1}, 'do_map', {42}, {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[3]}) + res = router_do_map(cg.router, {bids1[3]}, {42}, {timeout = 0.1}) t.assert_equals(res.val, nil) t.assert(res.err) - t.assert_equals(res.err_uuid, init.rs1_uuid) + t.assert_equals(res.err_uuid, cg.rs1_uuid) -- The moved buckets still exist on the rs1 with non-active status. -- Let's remove them and re-enable gc on rs1. - g.replica_1_a:exec(function() - ivshard.storage.internal.errinj.ERRINJ_BUCKET_GC_PAUSE = false + cg.replica_1_a:exec(function() + _G.bucket_gc_continue() _G.bucket_gc_wait() end) -- Now move the buckets back to rs1 and pause gc on rs2. -- The buckets will be ACTIVE on rs1 and SENT on rs2, -- so the partial map should succeed. - res = g.replica_2_a:exec(function(bid1, bid2, to) - ivshard.storage.internal.errinj.ERRINJ_BUCKET_GC_PAUSE = true - ivshard.storage.bucket_send(bid1, to) - local res, err = ivshard.storage.bucket_send(bid2, to) - return { - res = res, - err = err, - } - end, {init.bids1[1], init.bids1[2], init.rs1_uuid}) - t.assert_equals(res.err, nil) + cg.replica_2_a:exec(function(bid1, bid2, to) + _G.bucket_gc_pause() + _G.bucket_send(bid1, to) + _G.bucket_send(bid2, to) + end, {bids1[1], bids1[2], cg.rs1_uuid}) - res = g.router:exec(function(bid1, bid2) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[1], init.bids1[2]}) + res = router_do_map(cg.router, {bids1[1], bids1[2]}, + {42}, {timeout = vtest.wait_timeout}) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) - expected = { - [init.rs1_uuid] = {{1, 42}}, - } - t.assert_equals(res.val, expected) + t.assert_equals(res.val, { + [cg.rs1_uuid] = {{cg.rs1_uuid, 42}}, + }) -- Re-enable gc on rs2. 
- g.replica_2_a:exec(function() - ivshard.storage.internal.errinj.ERRINJ_BUCKET_GC_PAUSE = false + cg.replica_2_a:exec(function() + _G.bucket_gc_continue() _G.bucket_gc_wait() end) end -g.test_map_part_double_ref = function(g) - local expected, res - local init = map_part_init() - +g.test_map_part_double_ref = function(cg) + local bid1 = vtest.storage_first_bucket(cg.replica_1_a) + local bid2 = vtest.storage_first_bucket(cg.replica_2_a) -- First, disable discovery on the router to disable route cache update. - g.router:exec(function() + cg.router:exec(function(bid, uuid) ivshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY = true - end) + -- Make sure the location of the bucket is known. + local rs, err = ivshard.router.bucket_discovery(bid) + ilt.assert_equals(err, nil) + ilt.assert_equals(rs.uuid, uuid) + end, {bid1, cg.rs1_uuid}) -- Then, move the bucket form rs1 to rs2. Now the router has an outdated -- route cache. - g.replica_1_a:exec(function(bid, to) - ivshard.storage.bucket_send(bid, to) - end, {init.bids1[4], init.rs2_uuid}) - -- Call a partial map for the moved bucket and some bucket from rs2. The ref - -- stage should be done in two steps: + cg.replica_1_a:exec(function(bid, to) + _G.bucket_send(bid, to) + _G.bucket_gc_wait() + end, {bid1, cg.rs2_uuid}) + -- Call a partial map for the moved bucket and some bucket + -- from rs2. The ref stage should be done in two steps: -- 1. ref rs2 and returns the moved bucket; -- 2. discover the moved bucket on rs2 and avoid double reference; - res = g.router:exec(function(bid1, bid2) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[4], init.bids2[1]}) + local res = router_do_map(cg.router, {bid1, bid2}, + {42}, {timeout = vtest.wait_timeout}) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) - expected = { - [init.rs2_uuid] = {{2, 42}}, - } - t.assert_equals(res.val, expected) - -- Call a partial map one more time to make sure there are no references - -- left. - res = g.router:exec(function(bid1, bid2) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid1, bid2}, 'do_map', {42}, {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[4], init.bids2[1]}) - t.assert_equals(res.err, nil) - t.assert_equals(res.err_uuid, nil) - t.assert_equals(res.val, expected) + t.assert_equals(res.val, { + [cg.rs2_uuid] = {{cg.rs2_uuid, 42}}, + }) + -- Make sure there are no references left. + local _, err = vtest.cluster_exec_each(cg, function() + ilt.assert_equals(require('vshard.storage.ref').count, 0) + end) + t.assert_equals(err, nil) -- Return the bucket back and re-enable discovery on the router. 
- g.replica_2_a:exec(function(bid, to) - ivshard.storage.bucket_send(bid, to) - end, {init.bids1[4], init.rs1_uuid}) - g.router:exec(function() + cg.replica_2_a:exec(function(bid, to) + _G.bucket_send(bid, to) + _G.bucket_gc_wait() + end, {bid1, cg.rs1_uuid}) + cg.router:exec(function() ivshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY = false end) end -g.test_map_part_map = function(g) - local res - local init = map_part_init() +g.test_map_part_ref_timeout = function(cg) + local bids = vtest.storage_get_n_buckets(cg.replica_1_a, 2) + local bid1 = bids[1] + local bid2 = bids[2] + + bids = vtest.storage_get_n_buckets(cg.replica_2_a, 2) + local bid3 = bids[1] + local bid4 = bids[2] + + -- First, disable discovery on the router to disable route cache update. + cg.router:exec(function(bids) + ivshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY = true + -- Make sure the location of the bucket is known. + for _, bid in pairs(bids) do + local _, err = ivshard.router.bucket_discovery(bid) + ilt.assert_equals(err, nil) + end + end, {{bid1, bid2, bid3, bid4}}) + + -- Send bucket so the router thinks: + -- rs1: {b1, b2}, rs2: {b3, b4} + -- and actually the state is: + -- rs1: {b1}, rs2: {b2, b3, b4} + cg.replica_1_a:exec(function(bid, to) + _G.bucket_send(bid, to) + _G.bucket_gc_wait() + end, {bid2, cg.rs2_uuid}) + + -- Partial map goes with the outdated mapping to the storages, successfully + -- refs rs1. Then gets a bit stuck in rs2. Rs1 ref in the meantime time is + -- lost. Due to restart or timeout or whatever. + cg.replica_2_a:exec(function() + local lref = require('vshard.storage.ref') + rawset(_G, 'old_ref_add', lref.add) + lref.add = function(rid, sid, ...) + ilt.assert_equals(rawget(_G, 'test_ref'), nil) + rawset(_G, 'test_ref', {rid = rid, sid = sid}) + local ok, err = _G.old_ref_add(rid, sid, ...) + ilt.helpers.retrying({timeout = iwait_timeout}, function() + if rawget(_G, 'test_ref') then + error('Test refs is not picked up') + end + end) + return ok, err + end + end) + local f = fiber.new(function() + return router_do_map(cg.router, {bid1, bid2, bid3, bid4}, + {42}, {timeout = vtest.wait_timeout}) + end) + f:set_joinable(true) + cg.replica_2_a:exec(function() + ilt.helpers.retrying({timeout = iwait_timeout}, function() + if not rawget(_G, 'test_ref') then + error('Test refs is not set') + end + end) + local lref = require('vshard.storage.ref') + local _, err = lref.del(_G.test_ref.rid, _G.test_ref.sid) + ilt.assert_equals(err, nil) + -- Cleanup. + lref.add = _G.old_ref_add + _G.old_ref_add = nil + _G.test_ref = nil + end) + + -- The whole request must fail now. + local ok, res = f:join() + t.assert(ok) + t.assert(res) + t.assert_not_equals(res.err, nil) + t.assert_equals(res.err_uuid, cg.rs2_uuid) + + -- Make sure there are no references left. + local _, err = vtest.cluster_exec_each(cg, function() + ilt.assert_equals(require('vshard.storage.ref').count, 0) + end) + t.assert_equals(err, nil) - g.replica_2_a:exec(function() + -- Return the bucket back and re-enable discovery on the router. 
+ cg.replica_2_a:exec(function(bid, to) + _G.bucket_send(bid, to) + _G.bucket_gc_wait() + end, {bid2, cg.rs1_uuid}) + cg.router:exec(function() + ivshard.router.internal.errinj.ERRINJ_LONG_DISCOVERY = false + end) +end + +g.test_map_part_map = function(cg) + local bid1 = vtest.storage_first_bucket(cg.replica_1_a) + local bid2 = vtest.storage_first_bucket(cg.replica_2_a) + cg.replica_2_a:exec(function() + rawset(_G, 'old_do_map', _G.do_map) _G.do_map = function() return box.error(box.error.PROC_LUA, "map_err") end end) - res = g.router:exec(function(bid1, bid2) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid2, bid1}, 'do_map', {3}, {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[1], init.bids2[1]}) + local res = router_do_map(cg.router, {bid1, bid2}, + {3}, {timeout = vtest.wait_timeout}) t.assert_equals(res.val, nil) t.assert_covers(res.err, { code = box.error.PROC_LUA, type = 'ClientError', message = 'map_err' }) - t.assert_equals(res.err_uuid, init.rs2_uuid) + t.assert_equals(res.err_uuid, cg.rs2_uuid) -- Check that there is no dangling references after the error. - init = map_part_init() - res = g.router:exec(function(bid1, bid2) - local val, err, err_uuid = ivshard.router.map_part_callrw( - {bid1, bid2}, 'do_map', {3}, {timeout = iwait_timeout}) - return { - val = val, - err = err, - err_uuid = err_uuid, - } - end, {init.bids1[1], init.bids2[1]}) - t.assert_equals(res.err, nil) + local _, err = vtest.cluster_exec_each(cg, function() + ilt.assert_equals(require('vshard.storage.ref').count, 0) + end) + t.assert_equals(err, nil) + cg.replica_2_a:exec(function() + _G.do_map = _G.old_do_map + _G.old_do_map = nil + end) + res = router_do_map(cg.router, {bid1, bid2}, + {3}, {timeout = vtest.wait_timeout}) + t.assert_equals(res.err, nil, res.err) t.assert_equals(res.err_uuid, nil) t.assert_equals(res.val, { - [init.rs1_uuid] = {{1, 3}}, - [init.rs2_uuid] = {{2, 3}}, + [cg.rs1_uuid] = {{cg.rs1_uuid, 3}}, + [cg.rs2_uuid] = {{cg.rs2_uuid, 3}}, }) end + +g.test_map_part_callrw_raw = function(cg) + t.run_only_if(vutil.feature.netbox_return_raw) + -- + -- Successful map. + -- + local bid1 = vtest.storage_first_bucket(cg.replica_1_a) + local bid2 = vtest.storage_first_bucket(cg.replica_2_a) + local res = router_do_map(cg.router, {bid1, bid2}, {3}, + {timeout = vtest.wait_timeout, return_raw = true}) + t.assert_equals(res.val, { + [cg.rs1_uuid] = {{cg.rs1_uuid, 3}}, + [cg.rs2_uuid] = {{cg.rs2_uuid, 3}}, + }) + t.assert_equals(res.val_type, 'userdata') + t.assert(not res.err) + -- + -- Successful map, but one of the storages returns nothing. + -- + cg.replica_2_a:exec(function() + rawset(_G, 'old_do_map', _G.do_map) + _G.do_map = function() + return + end + end) + res = router_do_map(cg.router, {bid1, bid2}, {}, + {timeout = vtest.wait_timeout, return_raw = true}) + t.assert_equals(res.val, { + [cg.rs1_uuid] = {{cg.rs1_uuid}}, + }) + -- + -- Error at map stage. + -- + cg.replica_2_a:exec(function() + _G.do_map = function() + return box.error(box.error.PROC_LUA, "map_err") + end + end) + res = router_do_map(cg.router, {bid1, bid2}, {}, + {timeout = vtest.wait_timeout, return_raw = true}) + t.assert_equals(res.val, nil) + t.assert_covers(res.err, { + code = box.error.PROC_LUA, + type = 'ClientError', + message = 'map_err' + }, 'error object') + t.assert_equals(res.err_uuid, cg.rs2_uuid, 'error uuid') + -- + -- Cleanup. 
+ -- + cg.replica_2_a:exec(function() + _G.do_map = _G.old_do_map + _G.old_do_map = nil + end) +end diff --git a/test/router-luatest/router_test.lua b/test/router-luatest/router_test.lua index f9cd6920..022f578c 100644 --- a/test/router-luatest/router_test.lua +++ b/test/router-luatest/router_test.lua @@ -288,21 +288,16 @@ end g.test_group_buckets = function(g) local bids = vtest.storage_get_n_buckets(g.replica_1_a, 2) - - local res = g.router:exec(function(bid1, bid2) - local val, err = ivshard.router._buckets_group({bid2, bid1, bid1}) - return { - val = val, - err = err, - } + local val, err = g.router:exec(function(bid1, bid2) + return ivshard.router._buckets_group({bid2, bid1, bid1, bid2}, + ivtest.wait_timeout) end, {bids[1], bids[2]}) - assert(res.err == nil) + assert(err == nil) local rs1_uuid = g.replica_1_a:replicaset_uuid() local expected = { - [rs1_uuid] = {bids[1], bids[2]}, + [rs1_uuid] = {bids[2], bids[1]}, } - table.sort(expected[rs1_uuid]) - t.assert_equals(res.val, expected) + t.assert_equals(val, expected) end g.test_uri_compare_and_reuse = function(g) diff --git a/test/router/router.result b/test/router/router.result index 8f678564..9ad9bce1 100644 --- a/test/router/router.result +++ b/test/router/router.result @@ -1503,6 +1503,7 @@ error_messages - Use replicaset:update_master(...) instead of replicaset.update_master(...) - Use replicaset:wait_connected(...) instead of replicaset.wait_connected(...) - Use replicaset:wait_connected_all(...) instead of replicaset.wait_connected_all(...) + - Use replicaset:wait_master(...) instead of replicaset.wait_master(...) ... _, replica = next(replicaset.replicas) --- diff --git a/test/storage-luatest/storage_1_test.lua b/test/storage-luatest/storage_1_test.lua index 9e760ebe..0e8d8bd4 100644 --- a/test/storage-luatest/storage_1_test.lua +++ b/test/storage-luatest/storage_1_test.lua @@ -226,33 +226,34 @@ test_group.test_bucket_skip_validate = function(g) end) end -test_group.test_ref_with_lookup = function(g) +test_group.test_ref_with_buckets = function(g) g.replica_1_a:exec(function() + local lref = require('vshard.storage.ref') local res, err, _ - local timeout = 0.1 local rid = 42 local bids = _G.get_n_buckets(2) - local bid_extra = 3001 + local bucket_count = ivshard.storage.internal.total_bucket_count - -- Check for a single bucket. + -- No buckets. res, err = ivshard.storage._call( - 'storage_ref_with_lookup', - rid, - timeout, - {bids[1]} - ) + 'storage_ref_with_buckets', rid, iwait_timeout, {}) + ilt.assert_equals(err, nil) + ilt.assert_equals(res, {moved = {}}) + ilt.assert_equals(lref.count, 0) + + -- Check for a single ok bucket. + res, err = ivshard.storage._call( + 'storage_ref_with_buckets', rid, iwait_timeout, {bids[1]}) ilt.assert_equals(err, nil) ilt.assert_equals(res, {rid = rid, moved = {}}) + ilt.assert_equals(lref.count, 1) _, err = ivshard.storage._call('storage_unref', rid) ilt.assert_equals(err, nil) + ilt.assert_equals(lref.count, 0) - -- Check for multiple buckets. + -- Check for multiple ok buckets. res, err = ivshard.storage._call( - 'storage_ref_with_lookup', - rid, - timeout, - {bids[1], bids[2]} - ) + 'storage_ref_with_buckets', rid, iwait_timeout, {bids[1], bids[2]}) ilt.assert_equals(err, nil) ilt.assert_equals(res, {rid = rid, moved = {}}) _, err = ivshard.storage._call('storage_unref', rid) @@ -260,83 +261,69 @@ test_group.test_ref_with_lookup = function(g) -- Check for double referencing. 
res, err = ivshard.storage._call( - 'storage_ref_with_lookup', - rid, - timeout, - {bids[1], bids[1]} - ) + 'storage_ref_with_buckets', rid, iwait_timeout, {bids[1], bids[1]}) ilt.assert_equals(err, nil) ilt.assert_equals(res, {rid = rid, moved = {}}) + ilt.assert_equals(lref.count, 1) _, err = ivshard.storage._call('storage_unref', rid) ilt.assert_equals(err, nil) - _, err = ivshard.storage._call('storage_unref', rid) - t.assert_str_contains(err.message, - 'Can not delete a storage ref: no ref') + ilt.assert_equals(lref.count, 0) - -- Check for an absent bucket. + -- Bucket mix. res, err = ivshard.storage._call( - 'storage_ref_with_lookup', - rid, - timeout, - {bids[1], bids[2], bid_extra} - ) + 'storage_ref_with_buckets', rid, iwait_timeout, + {bucket_count + 1, bids[1], bucket_count + 2, bids[2], + bucket_count + 3}) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {rid = rid, moved = {bid_extra}}) + ilt.assert_equals(res, { + rid = rid, + moved = {bucket_count + 1, bucket_count + 2, bucket_count + 3} + }) _, err = ivshard.storage._call('storage_unref', rid) ilt.assert_equals(err, nil) - -- Check that we do not create a reference if there are no buckets. + -- No ref when all buckets are missing. res, err = ivshard.storage._call( - 'storage_ref_with_lookup', + 'storage_ref_with_buckets', rid, - timeout, - {bid_extra} + iwait_timeout, + {bucket_count + 1, bucket_count + 2} ) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {rid = nil, moved = {bid_extra}}) - res, err = ivshard.storage._call('storage_unref', rid) - t.assert_str_contains(err.message, - 'Can not delete a storage ref: no ref') - ilt.assert_equals(res, nil) - - -- Check for a timeout. - -- Emulate a case when all buckets are not writable. - local func = ivshard.storage.internal.bucket_are_all_rw - ivshard.storage.internal.bucket_are_all_rw = function() return false end + ilt.assert_equals(res, {moved = {bucket_count + 1, bucket_count + 2}}) + ilt.assert_equals(lref.count, 0) + + -- Timeout when some buckets aren't writable. Doesn't have to be the + -- same buckets as for moving. + _G.bucket_recovery_pause() + box.space._bucket:update( + {bids[1]}, {{'=', 2, ivconst.BUCKET.SENDING}}) res, err = ivshard.storage._call( - 'storage_ref_with_lookup', - rid, - timeout, - {bids[1]} - ) - ivshard.storage.internal.bucket_are_all_rw = func + 'storage_ref_with_buckets', rid, 0.01, {bids[2]}) + box.space._bucket:update( + {bids[1]}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) + _G.bucket_recovery_continue() t.assert_str_contains(err.message, 'Timeout exceeded') ilt.assert_equals(res, nil) - -- Check that the reference was not created. 
- res, err = ivshard.storage._call('storage_unref', rid) - t.assert_str_contains(err.message, - 'Can not delete a storage ref: no ref') - ilt.assert_equals(res, nil) + ilt.assert_equals(lref.count, 0) end) end -test_group.test_absent_buckets = function(g) +test_group.test_moved_buckets = function(g) g.replica_1_a:exec(function() local res, err = ivshard.storage._call( - 'storage_absent_buckets', + 'moved_buckets', {_G.get_first_bucket()} ) ilt.assert_equals(err, nil) ilt.assert_equals(res, {moved = {}}) - end) - g.replica_1_a:exec(function() - local bid_extra = 3001 - local res, err = ivshard.storage._call( - 'storage_absent_buckets', - {_G.get_first_bucket(), bid_extra} + local bucket_count = ivshard.storage.internal.total_bucket_count + res, err = ivshard.storage._call( + 'moved_buckets', + {_G.get_first_bucket(), bucket_count + 1, bucket_count + 2} ) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {moved = {bid_extra}}) + ilt.assert_equals(res, {moved = {bucket_count + 1, bucket_count + 2}}) end) end diff --git a/test/upgrade/upgrade.result b/test/upgrade/upgrade.result index c5b99ba0..5e8e71f5 100644 --- a/test/upgrade/upgrade.result +++ b/test/upgrade/upgrade.result @@ -174,13 +174,13 @@ vshard.storage._call('test_api', 1, 2, 3) | - - bucket_recv | - bucket_test_gc | - info + | - moved_buckets | - rebalancer_apply_routes | - rebalancer_request_state | - recovery_bucket_stat - | - storage_absent_buckets | - storage_map | - storage_ref - | - storage_ref_with_lookup + | - storage_ref_with_buckets | - storage_unref | - test_api | - 1 diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua index 086ed4ec..4f93bdee 100644 --- a/vshard/replicaset.lua +++ b/vshard/replicaset.lua @@ -1514,15 +1514,31 @@ local function buildall(sharding_cfg) return new_replicasets end --- --- Wait for masters connection during RECONNECT_TIMEOUT seconds. --- -local function wait_masters_connect(replicasets) - for _, rs in pairs(replicasets) do - if rs.master then - rs.master.conn:wait_connected(consts.RECONNECT_TIMEOUT) +local function wait_masters_connect(replicasets, timeout) + local err, err_id + -- Start connecting to all masters in parallel. + local deadline = fiber_clock() + timeout + for _, replicaset in pairs(replicasets) do + local master + master, err = replicaset:wait_master(timeout) + if not master then + err_id = replicaset.id + goto fail + end + replicaset:connect_replica(master) + timeout = deadline - fiber_clock() + end + -- Wait until all connections are established. + for _, replicaset in pairs(replicasets) do + timeout, err = replicaset:wait_connected(timeout) + if not timeout then + err_id = replicaset.id + goto fail end end + do return timeout end + ::fail:: + return nil, err, err_id end return { diff --git a/vshard/router/init.lua b/vshard/router/init.lua index 48bd9f4e..4fbb7432 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -225,32 +225,24 @@ end -- Group bucket ids by replicasets according to the router cache. local function buckets_group(router, bucket_ids, timeout) - timeout = timeout or consts.CALL_TIMEOUT_MIN local deadline = fiber_clock() + timeout local replicaset, err local replicaset_buckets = {} - local prev_id = nil local buckets, id - - -- Sort buckets to skip duplicates. - table.sort(bucket_ids) + local bucket_map = {} for _, bucket_id in pairs(bucket_ids) do - if bucket_id == prev_id then + if bucket_map[bucket_id] then goto continue end - + -- Avoid duplicates. 
+ bucket_map[bucket_id] = true if fiber_clock() > deadline then return nil, lerror.timeout() end - replicaset, err = bucket_resolve(router, bucket_id) if err ~= nil then return nil, err end - if replicaset == nil then - return nil, lerror.vshard(lerror.code.NO_ROUTE_TO_BUCKET, bucket_id) - end - id = replicaset.id buckets = replicaset_buckets[id] if buckets then @@ -258,9 +250,7 @@ local function buckets_group(router, bucket_ids, timeout) else replicaset_buckets[id] = {bucket_id} end - ::continue:: - prev_id = bucket_id end return replicaset_buckets @@ -804,37 +794,6 @@ local function router_call(router, bucket_id, opts, ...) ...) end -local function wait_connected_to_masters(replicasets, timeout) - local master, id - local err, err_id - - -- Start connecting to all masters in parallel. - local deadline = fiber_clock() + timeout - for _, replicaset in pairs(replicasets) do - master, err = replicaset:wait_master(timeout) - id = replicaset.id - if not master then - err_id = id - goto fail - end - replicaset:connect_replica(master) - timeout = deadline - fiber_clock() - end - - -- Wait until all connections are established. - for _, replicaset in pairs(replicasets) do - timeout, err = replicaset:wait_connected(timeout) - if not timeout then - err_id = replicaset.id - goto fail - end - end - do return timeout end - - ::fail:: - return nil, err, err_id -end - -- -- Perform map-reduce stages on the given set of replicasets. The map expects a -- valid ref ID which must be done beforehand. @@ -989,11 +948,8 @@ local function router_map_callrw(router, func, args, opts) -- -- Netbox async requests work only with active connections. Need to wait -- for the connection explicitly. - local rs_list = table_new(0, #replicasets) - for _, rs in pairs(replicasets) do - table.insert(rs_list, rs) - end - timeout, err, err_id = wait_connected_to_masters(rs_list, timeout) + timeout, err, err_id = lreplicaset.wait_masters_connect( + replicasets, timeout) if not timeout then goto fail end @@ -1090,18 +1046,22 @@ end -- though all replicasets were scanned. -- local function router_map_part_callrw(router, bucket_ids, func, args, opts) - local replicaset local grouped_buckets - local err, err_id, res, ok, map - local call_args - local rs_list - local replicasets = {} - local preallocated = false + local group_count + local err, err_id, res, map + local replicasets_to_map = {} local futures = {} - local call_opts = {is_async = true} - local rs_count = 0 - local timeout = opts and opts.timeout or consts.CALL_TIMEOUT_MIN + local do_return_raw + local timeout + if opts then + timeout = opts.timeout or consts.CALL_TIMEOUT_MIN + do_return_raw = opts.return_raw + else + timeout = consts.CALL_TIMEOUT_MIN + end + local opts_async = {is_async = true} local deadline = fiber_clock() + timeout + local replicasets_all = router.replicasets local rid = M.ref_id M.ref_id = rid + 1 @@ -1109,7 +1069,7 @@ local function router_map_part_callrw(router, bucket_ids, func, args, opts) -- netbox requests return box.NULL instead of nils. -- Ref stage. - while #bucket_ids > 0 do + while next(bucket_ids) do -- Group the buckets by replicasets according to the router cache. grouped_buckets, err = buckets_group(router, bucket_ids, timeout) if grouped_buckets == nil then @@ -1119,37 +1079,35 @@ local function router_map_part_callrw(router, bucket_ids, func, args, opts) -- Netbox async requests work only with active connections. -- So, first need to wait for the master connection explicitly. 
- rs_list = table_new(0, #grouped_buckets) + local replicasets_to_check = {} + group_count = 0 for uuid, _ in pairs(grouped_buckets) do - replicaset = router.replicasets[uuid] - table.insert(rs_list, replicaset) + group_count = group_count + 1 + table.insert(replicasets_to_check, replicasets_all[uuid]) end - timeout, err, err_id = wait_connected_to_masters(rs_list, timeout) + timeout, err, err_id = lreplicaset.wait_masters_connect( + replicasets_to_check, timeout) if not timeout then goto fail end -- Send ref requests with timeouts to the replicasets. - futures = table_new(0, #grouped_buckets) + futures = table_new(0, group_count) for id, buckets in pairs(grouped_buckets) do - replicaset = router.replicasets[id] - -- Netbox async requests work only with active connections. Need to - -- wait for the connection explicitly. - timeout, err = replicaset:wait_connected(timeout) if timeout == nil then err_id = id goto fail end - if replicasets[id] then - -- Replicaset is already referred on the previous iteration. + local args_ref + if replicasets_to_map[id] then + -- Replicaset is already referenced on a previous iteration. -- Simply get the moved buckets without double referencing. - call_args = {'storage_absent_buckets', buckets} + args_ref = {'moved_buckets', buckets} else - call_args = {'storage_ref_with_lookup', rid, timeout, buckets} - rs_count = rs_count + 1 + args_ref = {'storage_ref_with_buckets', rid, timeout, buckets} end - res, err = replicaset:callrw('vshard.storage._call', call_args, - call_opts) + res, err = replicasets_all[id]:callrw('vshard.storage._call', + args_ref, opts_async) if res == nil then err_id = id goto fail @@ -1157,17 +1115,10 @@ local function router_map_part_callrw(router, bucket_ids, func, args, opts) futures[id] = res end - if not preallocated then - -- Current preallocation works only for the first iteration - -- (i.e. router cache is not outdated). - replicasets = table_new(0, rs_count) - preallocated = true - end - -- Wait for the refs to be done and collect moved buckets. bucket_ids = {} - for id, future in pairs(futures) do - res, err = future_wait(future, timeout) + for id, f in pairs(futures) do + res, err = future_wait(f, timeout) -- Handle netbox error first. if res == nil then err_id = id @@ -1184,71 +1135,23 @@ local function router_map_part_callrw(router, bucket_ids, func, args, opts) table.insert(bucket_ids, bucket_id) end if res.rid then - -- If there are no buckets on the replicaset, - -- it would not be referred. - replicasets[id] = router.replicasets[id] + assert(not replicasets_to_map[id]) + -- If there are no buckets on the replicaset, it would not be + -- referenced. + replicasets_to_map[id] = replicasets_all[id] end timeout = deadline - fiber_clock() end end - - -- Map stage. - map = table_new(0, rs_count) - futures = table_new(0, rs_count) - args = {'storage_map', rid, func, args} - -- Send map requests. - for id, rs in pairs(replicasets) do - res, err = rs:callrw('vshard.storage._call', args, call_opts) - if res == nil then - err_id = id - goto fail - end - futures[id] = res - end - - -- Reduce stage. - -- Collect map responses (refs were deleted by the storages for non-error - -- results). - for id, future in pairs(futures) do - res, err = future_wait(future, timeout) - if res == nil then - err_id = id - goto fail - end - -- Map returns true,res or nil,err. 
- ok, res = res[1], res[2] - if ok == nil then - err = res - err_id = id - goto fail - end - if res ~= nil then - -- Store as a table so in future it could be extended for - -- multireturn. - map[id] = {res} - end - timeout = deadline - fiber_clock() + map, err, err_id = replicasets_map_reduce( + replicasets_to_map, rid, func, args, { + timeout = timeout, return_raw = do_return_raw + }) + if map then + return map end - do return map end - ::fail:: - local f - for id, future in pairs(futures) do - future:discard() - -- Best effort to remove the created refs before exiting. Can help if - -- the timeout was big and the error happened early. - call_args = {'storage_unref', rid} - replicaset = router.replicasets[id] - if replicaset then - f = replicaset:callrw('vshard.storage._call', call_args, call_opts) - if f ~= nil then - -- Don't care waiting for a result - no time for this. But it - -- won't affect the request sending if the connection is still - -- alive. - f:discard() - end - end - end + replicasets_map_cancel_refs(replicasets_all, futures, rid) err = lerror.make(err) return nil, err, err_id end @@ -1652,7 +1555,7 @@ local function router_cfg(router, cfg, is_reload) for _, replicaset in pairs(new_replicasets) do replicaset:connect_all() end - lreplicaset.wait_masters_connect(new_replicasets) + lreplicaset.wait_masters_connect(new_replicasets, consts.RECONNECT_TIMEOUT) lreplicaset.outdate_replicasets(router.replicasets, vshard_cfg.connection_outdate_delay) router.connection_outdate_delay = vshard_cfg.connection_outdate_delay diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua index 2fbfba27..a8fa3a07 100644 --- a/vshard/storage/init.lua +++ b/vshard/storage/init.lua @@ -3187,7 +3187,7 @@ end -- @return A dictionary with a list of bucket identifiers which are -- not present on the current instance. -- -local function storage_absent_buckets(bucket_ids) +local function storage_moved_buckets(bucket_ids) local status = consts.BUCKET local moved_buckets = {} for _, bucket_id in pairs(bucket_ids) do @@ -3212,31 +3212,13 @@ end -- are absent, the reference is not created and a nil reference id -- with the list of absent buckets is returned. -- -local function storage_ref_with_lookup(rid, timeout, bucket_ids) - local moved = storage_absent_buckets(bucket_ids).moved +local function storage_ref_with_buckets(rid, timeout, bucket_ids) + local moved = storage_moved_buckets(bucket_ids).moved if #moved == #bucket_ids then - -- Take an advantage that moved buckets are returned in the same - -- order as in the input list. - local do_match = true - local next_moved = next(moved) - local next_passed = next(bucket_ids) - ::continue:: - if next_moved then - if next_moved == next_passed then - next_moved = next(moved, next_moved) - next_passed = next(bucket_ids, next_passed) - goto continue - else - do_match = false - end - end - if do_match then - -- If all the passed buckets are absent, there is no need - -- to create a ref. - return {rid = nil, moved = moved} - end + -- If all the passed buckets are absent, there is no need to create a + -- ref. 
+ return {rid = nil, moved = moved} end - local ok, err = storage_ref(rid, timeout) if not ok then return nil, err @@ -3298,9 +3280,9 @@ service_call_api = setmetatable({ rebalancer_apply_routes = rebalancer_apply_routes, rebalancer_request_state = rebalancer_request_state, recovery_bucket_stat = recovery_bucket_stat, - storage_absent_buckets = storage_absent_buckets, + moved_buckets = storage_moved_buckets, storage_ref = storage_ref, - storage_ref_with_lookup = storage_ref_with_lookup, + storage_ref_with_buckets = storage_ref_with_buckets, storage_unref = storage_unref, storage_map = storage_map, info = storage_service_info, From b2c3c6e140040d45753b5ab4b6e55b4afdd29df3 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Thu, 3 Oct 2024 12:11:26 +0200 Subject: [PATCH 11/17] router: move Ref stage of Map-Reduce into new func There are 2 Ref-Map-Reduce functions right now - map_callrw() and map_part_callrw(). Their only difference is that the former refs the whole cluster, while the latter refs only a subset of storages. The rest is the same. There is an idea, that better lets merge these functions into one and make the bucket IDs an option. The commit extracts the Ref stages of both functions into separate helpers which will allow to keep this future single function very short and simple. NO_DOC=internal NO_TEST=refactoring --- vshard/router/init.lua | 362 ++++++++++++++++++++++------------------- 1 file changed, 195 insertions(+), 167 deletions(-) diff --git a/vshard/router/init.lua b/vshard/router/init.lua index 4fbb7432..d37890d4 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -794,6 +794,178 @@ local function router_call(router, bucket_id, opts, ...) ...) end +-- +-- Perform Ref stage of the Ref-Map-Reduce process on all the known replicasets. +-- +local function router_ref_storage_all(router, timeout) + local replicasets = router.replicasets + local deadline = fiber_clock() + timeout + local err, err_id, res + local futures = {} + local bucket_count = 0 + local opts_async = {is_async = true} + local rs_count = 0 + local rid = M.ref_id + M.ref_id = rid + 1 + -- Nil checks are done explicitly here (== nil instead of 'not'), because + -- netbox requests return box.NULL instead of nils. + + -- + -- Ref stage: send. + -- + -- Netbox async requests work only with active connections. Need to wait + -- for the connection explicitly. + timeout, err, err_id = lreplicaset.wait_masters_connect( + replicasets, timeout) + if not timeout then + goto fail + end + for id, rs in pairs(replicasets) do + res, err = rs:callrw('vshard.storage._call', + {'storage_ref', rid, timeout}, opts_async) + if res == nil then + err_id = id + goto fail + end + futures[id] = res + rs_count = rs_count + 1 + end + -- + -- Ref stage: collect. + -- + for id, future in pairs(futures) do + res, err = future_wait(future, timeout) + -- Handle netbox error first. + if res == nil then + err_id = id + goto fail + end + -- Ref returns nil,err or bucket count. + res, err = res[1], res[2] + if res == nil then + err_id = id + goto fail + end + bucket_count = bucket_count + res + timeout = deadline - fiber_clock() + end + -- All refs are done but not all buckets are covered. This is odd and can + -- mean many things. The most possible ones: 1) outdated configuration on + -- the router and it does not see another replicaset with more buckets, + -- 2) some buckets are simply lost or duplicated - could happen as a bug, or + -- if the user does a maintenance of some kind by creating/deleting buckets. 
+ -- In both cases can't guarantee all the data would be covered by Map calls. + if bucket_count ~= router.total_bucket_count then + err = lerror.vshard(lerror.code.UNKNOWN_BUCKETS, + router.total_bucket_count - bucket_count) + goto fail + end + do return timeout, nil, nil, rid, futures, replicasets end + + ::fail:: + return nil, err, err_id, rid, futures +end + +-- +-- Perform Ref stage of the Ref-Map-Reduce process on a subset of all the +-- replicasets, which contains all the listed bucket IDs. +-- +local function router_ref_storage_by_buckets(router, bucket_ids, timeout) + local grouped_buckets + local group_count + local err, err_id, res + local replicasets_all = router.replicasets + local replicasets_to_map = {} + local futures = {} + local opts_async = {is_async = true} + local deadline = fiber_clock() + timeout + local rid = M.ref_id + M.ref_id = rid + 1 + + -- Nil checks are done explicitly here (== nil instead of 'not'), because + -- netbox requests return box.NULL instead of nils. + + -- Ref stage. + while next(bucket_ids) do + -- Group the buckets by replicasets according to the router cache. + grouped_buckets, err = buckets_group(router, bucket_ids, timeout) + if grouped_buckets == nil then + goto fail + end + timeout = deadline - fiber_clock() + + -- Netbox async requests work only with active connections. + -- So, first need to wait for the master connection explicitly. + local replicasets_to_check = {} + group_count = 0 + for uuid, _ in pairs(grouped_buckets) do + group_count = group_count + 1 + table.insert(replicasets_to_check, replicasets_all[uuid]) + end + timeout, err, err_id = lreplicaset.wait_masters_connect( + replicasets_to_check, timeout) + if not timeout then + goto fail + end + + -- Send ref requests with timeouts to the replicasets. + futures = table_new(0, group_count) + for id, buckets in pairs(grouped_buckets) do + if timeout == nil then + err_id = id + goto fail + end + local args_ref + if replicasets_to_map[id] then + -- Replicaset is already referenced on a previous iteration. + -- Simply get the moved buckets without double referencing. + args_ref = {'moved_buckets', buckets} + else + args_ref = {'storage_ref_with_buckets', rid, timeout, buckets} + end + res, err = replicasets_all[id]:callrw('vshard.storage._call', + args_ref, opts_async) + if res == nil then + err_id = id + goto fail + end + futures[id] = res + end + + -- Wait for the refs to be done and collect moved buckets. + bucket_ids = {} + for id, f in pairs(futures) do + res, err = future_wait(f, timeout) + -- Handle netbox error first. + if res == nil then + err_id = id + goto fail + end + -- Ref returns nil,err or {rid, moved}. + res, err = res[1], res[2] + if res == nil then + err_id = id + goto fail + end + for _, bucket_id in pairs(res.moved) do + bucket_reset(router, bucket_id) + table.insert(bucket_ids, bucket_id) + end + if res.rid then + assert(not replicasets_to_map[id]) + -- If there are no buckets on the replicaset, it would not be + -- referenced. + replicasets_to_map[id] = replicasets_all[id] + end + timeout = deadline - fiber_clock() + end + end + do return timeout, nil, nil, rid, futures, replicasets_to_map end + + ::fail:: + return nil, err, err_id, rid, futures +end + -- -- Perform map-reduce stages on the given set of replicasets. The map expects a -- valid ref ID which must be done beforehand. @@ -923,83 +1095,25 @@ end -- found even though all replicasets were scanned. 
-- local function router_map_callrw(router, func, args, opts) - local replicasets = router.replicasets - local timeout - local do_return_raw + local replicasets, err, err_id, map, futures, rid + local timeout, do_return_raw if opts then timeout = opts.timeout or consts.CALL_TIMEOUT_MIN do_return_raw = opts.return_raw else timeout = consts.CALL_TIMEOUT_MIN end - local deadline = fiber_clock() + timeout - local err, err_id, res, map - local futures = {} - local bucket_count = 0 - local opts_async = {is_async = true} - local rs_count = 0 - local rid = M.ref_id - M.ref_id = rid + 1 - -- Nil checks are done explicitly here (== nil instead of 'not'), because - -- netbox requests return box.NULL instead of nils. - - -- - -- Ref stage: send. - -- - -- Netbox async requests work only with active connections. Need to wait - -- for the connection explicitly. - timeout, err, err_id = lreplicaset.wait_masters_connect( - replicasets, timeout) - if not timeout then - goto fail - end - for id, rs in pairs(replicasets) do - res, err = rs:callrw('vshard.storage._call', - {'storage_ref', rid, timeout}, opts_async) - if res == nil then - err_id = id - goto fail + timeout, err, err_id, rid, futures, replicasets = + router_ref_storage_all(router, timeout) + if timeout then + map, err, err_id = replicasets_map_reduce(replicasets, rid, func, + args, { + timeout = timeout, return_raw = do_return_raw + }) + if map then + return map end - futures[id] = res - rs_count = rs_count + 1 end - -- - -- Ref stage: collect. - -- - for id, future in pairs(futures) do - res, err = future_wait(future, timeout) - -- Handle netbox error first. - if res == nil then - err_id = id - goto fail - end - -- Ref returns nil,err or bucket count. - res, err = res[1], res[2] - if res == nil then - err_id = id - goto fail - end - bucket_count = bucket_count + res - timeout = deadline - fiber_clock() - end - -- All refs are done but not all buckets are covered. This is odd and can - -- mean many things. The most possible ones: 1) outdated configuration on - -- the router and it does not see another replicaset with more buckets, - -- 2) some buckets are simply lost or duplicated - could happen as a bug, or - -- if the user does a maintenance of some kind by creating/deleting buckets. - -- In both cases can't guarantee all the data would be covered by Map calls. - if bucket_count ~= router.total_bucket_count then - err = lerror.vshard(lerror.code.UNKNOWN_BUCKETS, - router.total_bucket_count - bucket_count) - goto fail - end - map, err, err_id = replicasets_map_reduce(replicasets, rid, func, args, { - timeout = timeout, return_raw = do_return_raw - }) - if map then - return map - end -::fail:: replicasets_map_cancel_refs(replicasets, futures, rid) err = lerror.make(err) return nil, err, err_id @@ -1046,112 +1160,26 @@ end -- though all replicasets were scanned. 
-- local function router_map_part_callrw(router, bucket_ids, func, args, opts) - local grouped_buckets - local group_count - local err, err_id, res, map - local replicasets_to_map = {} - local futures = {} - local do_return_raw - local timeout + local timeout, err, err_id, map, futures, do_return_raw, rid + local replicasets_to_map if opts then timeout = opts.timeout or consts.CALL_TIMEOUT_MIN do_return_raw = opts.return_raw else timeout = consts.CALL_TIMEOUT_MIN end - local opts_async = {is_async = true} - local deadline = fiber_clock() + timeout - local replicasets_all = router.replicasets - local rid = M.ref_id - M.ref_id = rid + 1 - - -- Nil checks are done explicitly here (== nil instead of 'not'), because - -- netbox requests return box.NULL instead of nils. - - -- Ref stage. - while next(bucket_ids) do - -- Group the buckets by replicasets according to the router cache. - grouped_buckets, err = buckets_group(router, bucket_ids, timeout) - if grouped_buckets == nil then - goto fail - end - timeout = deadline - fiber_clock() - - -- Netbox async requests work only with active connections. - -- So, first need to wait for the master connection explicitly. - local replicasets_to_check = {} - group_count = 0 - for uuid, _ in pairs(grouped_buckets) do - group_count = group_count + 1 - table.insert(replicasets_to_check, replicasets_all[uuid]) + timeout, err, err_id, rid, futures, replicasets_to_map = + router_ref_storage_by_buckets(router, bucket_ids, timeout) + if timeout then + map, err, err_id = replicasets_map_reduce( + replicasets_to_map, rid, func, args, { + timeout = timeout, return_raw = do_return_raw + }) + if map then + return map end - timeout, err, err_id = lreplicaset.wait_masters_connect( - replicasets_to_check, timeout) - if not timeout then - goto fail - end - - -- Send ref requests with timeouts to the replicasets. - futures = table_new(0, group_count) - for id, buckets in pairs(grouped_buckets) do - if timeout == nil then - err_id = id - goto fail - end - local args_ref - if replicasets_to_map[id] then - -- Replicaset is already referenced on a previous iteration. - -- Simply get the moved buckets without double referencing. - args_ref = {'moved_buckets', buckets} - else - args_ref = {'storage_ref_with_buckets', rid, timeout, buckets} - end - res, err = replicasets_all[id]:callrw('vshard.storage._call', - args_ref, opts_async) - if res == nil then - err_id = id - goto fail - end - futures[id] = res - end - - -- Wait for the refs to be done and collect moved buckets. - bucket_ids = {} - for id, f in pairs(futures) do - res, err = future_wait(f, timeout) - -- Handle netbox error first. - if res == nil then - err_id = id - goto fail - end - -- Ref returns nil,err or {rid, moved}. - res, err = res[1], res[2] - if res == nil then - err_id = id - goto fail - end - for _, bucket_id in pairs(res.moved) do - bucket_reset(router, bucket_id) - table.insert(bucket_ids, bucket_id) - end - if res.rid then - assert(not replicasets_to_map[id]) - -- If there are no buckets on the replicaset, it would not be - -- referenced. 
- replicasets_to_map[id] = replicasets_all[id] - end - timeout = deadline - fiber_clock() - end - end - map, err, err_id = replicasets_map_reduce( - replicasets_to_map, rid, func, args, { - timeout = timeout, return_raw = do_return_raw - }) - if map then - return map end -::fail:: - replicasets_map_cancel_refs(replicasets_all, futures, rid) + replicasets_map_cancel_refs(router.replicasets, futures, rid) err = lerror.make(err) return nil, err, err_id end From c72abb982aea27c78f62455f18eb331babf10bd7 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Thu, 3 Oct 2024 12:18:58 +0200 Subject: [PATCH 12/17] router: merge map_callrw and map_part_callrw The behavior is regulated with the new bucket_ids option. @TarantoolBot document Title: vshard: `bucket_ids` option for `router.map_callrw()` The option is an array of numeric bucket IDs. When specified, the Ref-Map-Reduce is only performed on the masters having at least one of these buckets. By default all the stages are done on all masters in the cluster. Example: ```Lua -- Assume buckets 1, 2, 3 cover replicasets UUID_A and UUID_B. res, err = vshard.router.map_callrw(func, args, {bucket_ids = {1, 2, 3}}) assert(res[UUID_A] == {func_result_from_A}) assert(res[UUID_B] == {func_result_from_B}) ``` --- test/router-luatest/map_part_test.lua | 94 ++++++++++++++-------- vshard/router/init.lua | 108 +++++++------------------- 2 files changed, 91 insertions(+), 111 deletions(-) diff --git a/test/router-luatest/map_part_test.lua b/test/router-luatest/map_part_test.lua index 1a606b2c..cdaccd28 100644 --- a/test/router-luatest/map_part_test.lua +++ b/test/router-luatest/map_part_test.lua @@ -38,12 +38,12 @@ g.before_all(function(cg) cg.router = router local res, err = router:exec(function() local res, err = ivshard.router.bootstrap({timeout = iwait_timeout}) - rawset(_G, 'do_map', function(bids, args, opts) - local old_bids = table.copy(bids) - local val, err, err_uuid = ivshard.router.map_part_callrw( - bids, 'do_map', args, opts) - -- Make sure the bucket list isn't changed by vshard. - ilt.assert_equals(bids, old_bids) + rawset(_G, 'do_map', function(args, opts) + local old_opts = table.copy(opts) + local val, err, err_uuid = ivshard.router.map_callrw( + 'do_map', args, opts) + -- Make sure the options aren't changed by vshard. + ilt.assert_equals(old_opts, opts) local val_type if opts.return_raw and val ~= nil then -- Src+value. The src is plain Lua data. The value is raw. 
@@ -82,16 +82,18 @@ g.after_all(function(cg) cg.cluster:drop() end) -local function router_do_map(router, bids, args, opts) - return router:exec(function(bids, args, opts) - return _G.do_map(bids, args, opts) - end, {bids, args, opts}) +local function router_do_map(router, args, opts) + return router:exec(function(args, opts) + return _G.do_map(args, opts) + end, {args, opts}) end g.test_map_part_single_rs = function(cg) local bids = vtest.storage_get_n_buckets(cg.replica_1_a, 4) - local res = router_do_map(cg.router, {bids[3], bids[2]}, - {123}, {timeout = vtest.wait_timeout}) + local res = router_do_map(cg.router, {123}, { + timeout = vtest.wait_timeout, + bucket_ids = {bids[3], bids[2]}, + }) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) t.assert_equals(res.val, { @@ -102,8 +104,10 @@ end g.test_map_part_multi_rs = function(cg) local bid1 = vtest.storage_first_bucket(cg.replica_1_a) local bid2 = vtest.storage_first_bucket(cg.replica_2_a) - local res = router_do_map(cg.router, {bid1, bid2}, - {123}, {timeout = vtest.wait_timeout}) + local res = router_do_map(cg.router, {123}, { + timeout = vtest.wait_timeout, + bucket_ids = {bid1, bid2}, + }) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) t.assert_equals(res.val, { @@ -123,8 +127,10 @@ g.test_map_part_ref = function(cg) _G.bucket_send(bid2, to) end, {bids1[1], bids1[2], cg.rs2_uuid}) -- The buckets are ACTIVE on rs2, so the partial map should succeed. - local res = router_do_map(cg.router, {bids1[1], bids1[2]}, - {42}, {timeout = vtest.wait_timeout}) + local res = router_do_map(cg.router, {42}, { + timeout = vtest.wait_timeout, + bucket_ids = {bids1[1], bids1[2]}, + }) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) t.assert_equals(res.val, { @@ -133,7 +139,10 @@ g.test_map_part_ref = function(cg) -- But if we use some active bucket from rs1, the partial map should fail. -- The reason is that the moved buckets are still in the SENT state and -- we can't take a ref. - res = router_do_map(cg.router, {bids1[3]}, {42}, {timeout = 0.1}) + res = router_do_map(cg.router, {42}, { + timeout = 0.1, + bucket_ids = {bids1[3]}, + }) t.assert_equals(res.val, nil) t.assert(res.err) t.assert_equals(res.err_uuid, cg.rs1_uuid) @@ -152,8 +161,10 @@ g.test_map_part_ref = function(cg) _G.bucket_send(bid2, to) end, {bids1[1], bids1[2], cg.rs1_uuid}) - res = router_do_map(cg.router, {bids1[1], bids1[2]}, - {42}, {timeout = vtest.wait_timeout}) + res = router_do_map(cg.router, {42}, { + timeout = vtest.wait_timeout, + bucket_ids = {bids1[1], bids1[2]}, + }) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) t.assert_equals(res.val, { @@ -187,8 +198,10 @@ g.test_map_part_double_ref = function(cg) -- from rs2. The ref stage should be done in two steps: -- 1. ref rs2 and returns the moved bucket; -- 2. 
discover the moved bucket on rs2 and avoid double reference; - local res = router_do_map(cg.router, {bid1, bid2}, - {42}, {timeout = vtest.wait_timeout}) + local res = router_do_map(cg.router, {42}, { + timeout = vtest.wait_timeout, + bucket_ids = {bid1, bid2}, + }) t.assert_equals(res.err, nil) t.assert_equals(res.err_uuid, nil) t.assert_equals(res.val, { @@ -256,8 +269,10 @@ g.test_map_part_ref_timeout = function(cg) end end) local f = fiber.new(function() - return router_do_map(cg.router, {bid1, bid2, bid3, bid4}, - {42}, {timeout = vtest.wait_timeout}) + return router_do_map(cg.router, {42}, { + timeout = vtest.wait_timeout, + bucket_ids = {bid1, bid2, bid3, bid4}, + }) end) f:set_joinable(true) cg.replica_2_a:exec(function() @@ -307,8 +322,10 @@ g.test_map_part_map = function(cg) return box.error(box.error.PROC_LUA, "map_err") end end) - local res = router_do_map(cg.router, {bid1, bid2}, - {3}, {timeout = vtest.wait_timeout}) + local res = router_do_map(cg.router, {3}, { + timeout = vtest.wait_timeout, + bucket_ids = {bid1, bid2}, + }) t.assert_equals(res.val, nil) t.assert_covers(res.err, { code = box.error.PROC_LUA, @@ -325,8 +342,10 @@ g.test_map_part_map = function(cg) _G.do_map = _G.old_do_map _G.old_do_map = nil end) - res = router_do_map(cg.router, {bid1, bid2}, - {3}, {timeout = vtest.wait_timeout}) + res = router_do_map(cg.router, {3}, { + timeout = vtest.wait_timeout, + bucket_ids = {bid1, bid2}, + }) t.assert_equals(res.err, nil, res.err) t.assert_equals(res.err_uuid, nil) t.assert_equals(res.val, { @@ -342,8 +361,11 @@ g.test_map_part_callrw_raw = function(cg) -- local bid1 = vtest.storage_first_bucket(cg.replica_1_a) local bid2 = vtest.storage_first_bucket(cg.replica_2_a) - local res = router_do_map(cg.router, {bid1, bid2}, {3}, - {timeout = vtest.wait_timeout, return_raw = true}) + local res = router_do_map(cg.router, {3}, { + timeout = vtest.wait_timeout, + return_raw = true, + bucket_ids = {bid1, bid2}, + }) t.assert_equals(res.val, { [cg.rs1_uuid] = {{cg.rs1_uuid, 3}}, [cg.rs2_uuid] = {{cg.rs2_uuid, 3}}, @@ -359,8 +381,11 @@ g.test_map_part_callrw_raw = function(cg) return end end) - res = router_do_map(cg.router, {bid1, bid2}, {}, - {timeout = vtest.wait_timeout, return_raw = true}) + res = router_do_map(cg.router, {}, { + timeout = vtest.wait_timeout, + return_raw = true, + bucket_ids = {bid1, bid2}, + }) t.assert_equals(res.val, { [cg.rs1_uuid] = {{cg.rs1_uuid}}, }) @@ -372,8 +397,11 @@ g.test_map_part_callrw_raw = function(cg) return box.error(box.error.PROC_LUA, "map_err") end end) - res = router_do_map(cg.router, {bid1, bid2}, {}, - {timeout = vtest.wait_timeout, return_raw = true}) + res = router_do_map(cg.router, {}, { + timeout = vtest.wait_timeout, + return_raw = true, + bucket_ids = {bid1, bid2}, + }) t.assert_equals(res.val, nil) t.assert_covers(res.err, { code = box.error.PROC_LUA, diff --git a/vshard/router/init.lua b/vshard/router/init.lua index d37890d4..0430b596 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -1056,9 +1056,13 @@ local function replicasets_map_cancel_refs(replicasets, futures, rid) end -- --- Consistent Map-Reduce. The given function is called on all masters in the --- cluster with a guarantee that in case of success it was executed with all --- buckets being accessible for reads and writes. +-- Consistent Map-Reduce. The given function is called on masters in the cluster +-- with a guarantee that in case of success it was executed with all buckets +-- being accessible for reads and writes. 
+-- +-- The selection of masters depends on bucket_ids option. When specified, the +-- Map-Reduce is performed only on masters having at least one of these buckets. +-- Otherwise it is executed on all the masters in the cluster. -- -- Consistency in scope of map-reduce means all the data was accessible, and -- didn't move during map requests execution. To preserve the consistency there @@ -1080,11 +1084,19 @@ end -- @param router Router instance to use. -- @param func Name of the function to call. -- @param args Function arguments passed in netbox style (as an array). --- @param opts Can only contain 'timeout' as a number of seconds. Note that the --- refs may end up being kept on the storages during this entire timeout if --- something goes wrong. For instance, network issues appear. This means --- better not use a value bigger than necessary. A stuck infinite ref can --- only be dropped by this router restart/reconnect or the storage restart. +-- @param opts Options. See below: +-- - timeout - a number of seconds. Note that the refs may end up being kept +-- on the storages during this entire timeout if something goes wrong. +-- For instance, network issues appear. This means better not use a +-- value bigger than necessary. A stuck infinite ref can only be dropped +-- by this router restart/reconnect or the storage restart. +-- - return_raw - true/false. When specified, the returned values are not +-- decoded into Lua native objects and stay packed as a msgpack object +-- (see 'msgpack' module). By default all is decoded. That might be +-- undesirable when the returned values are going to be forwarded back +-- into the network anyway. +-- - bucket_ids - an array of bucket IDs which have to be covered by +-- Map-Reduce. By default the whole cluster is covered. -- -- @return In case of success - a map with replicaset ID (UUID or name) keys and -- values being what the function returned from the replicaset. @@ -1095,84 +1107,25 @@ end -- found even though all replicasets were scanned. -- local function router_map_callrw(router, func, args, opts) - local replicasets, err, err_id, map, futures, rid - local timeout, do_return_raw + local replicasets_to_map, err, err_id, map, futures, rid + local timeout, do_return_raw, bucket_ids if opts then timeout = opts.timeout or consts.CALL_TIMEOUT_MIN do_return_raw = opts.return_raw + bucket_ids = opts.bucket_ids else timeout = consts.CALL_TIMEOUT_MIN end - timeout, err, err_id, rid, futures, replicasets = - router_ref_storage_all(router, timeout) - if timeout then - map, err, err_id = replicasets_map_reduce(replicasets, rid, func, - args, { - timeout = timeout, return_raw = do_return_raw - }) - if map then - return map - end - end - replicasets_map_cancel_refs(replicasets, futures, rid) - err = lerror.make(err) - return nil, err, err_id -end - --- --- Partial Ref-Map-Reduce. The given function is called on the masters --- with a guarantee that in case of success it was executed exactly once --- on all storages that contain the given buckets. Execution on masters --- is chosen to allow buckets be accessed for reads and writes during the --- call. --- --- The execution consists of the following stages: --- --- Ref stage. We ref all the storages with the given buckets. It is --- an iterative process, because the router's cache may be outdated, --- and we need to discover the moved buckets. At the end of this stage --- we have refed all the storages containing the given buckets with the --- given timeout. --- --- Map stage. 
We call the given function on all the storages we have --- refed. On the storage side we use a `storage_map` function, which --- switches the timeout ref to the manual mode and unrefs the storage --- after the function is called. --- --- Reduce stage. We collect the results of the function call. --- --- @param router Instance to use. --- @param bucket_ids List of bucket identifiers. --- @param func Name of the function to call. --- @param args Function arguments passed in netbox style (as an array). --- @param opts Can only contain 'timeout' as a number of seconds. Note that the --- refs may end up being kept on the storages during this entire timeout if --- something goes wrong. For instance, network issues appear. This means --- better not use a value bigger than necessary. A stuck infinite ref can --- only be dropped by this router restart/reconnect or the storage restart. --- --- @return In case of success - a map with replicaset UUID keys and values being --- what the function returned from the replicaset. --- --- @return In case of an error - nil, error object, optional UUID of the --- replicaset where the error happened. UUID may be not present if it wasn't --- about concrete replicaset. For example, not all buckets were found even --- though all replicasets were scanned. --- -local function router_map_part_callrw(router, bucket_ids, func, args, opts) - local timeout, err, err_id, map, futures, do_return_raw, rid - local replicasets_to_map - if opts then - timeout = opts.timeout or consts.CALL_TIMEOUT_MIN - do_return_raw = opts.return_raw + if bucket_ids then + timeout, err, err_id, rid, futures, replicasets_to_map = + router_ref_storage_by_buckets(router, bucket_ids, timeout) else - timeout = consts.CALL_TIMEOUT_MIN + timeout, err, err_id, rid, futures, replicasets_to_map = + router_ref_storage_all(router, timeout) end - timeout, err, err_id, rid, futures, replicasets_to_map = - router_ref_storage_by_buckets(router, bucket_ids, timeout) if timeout then - map, err, err_id = replicasets_map_reduce( - replicasets_to_map, rid, func, args, { + map, err, err_id = replicasets_map_reduce(replicasets_to_map, rid, func, + args, { timeout = timeout, return_raw = do_return_raw }) if map then @@ -2029,7 +1982,6 @@ local router_mt = { callre = router_make_api(router_callre), callbre = router_make_api(router_callbre), map_callrw = router_make_api(router_map_callrw), - map_part_callrw = router_make_api(router_map_part_callrw), route = router_make_api(router_route), routeall = router_make_api(router_routeall), bucket_id = router_make_api(router_bucket_id), From 59deb2824721ae80ab1a1a6a5e4e32a317df08e7 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Thu, 3 Oct 2024 12:26:47 +0200 Subject: [PATCH 13/17] test: move test_map_callrw_raw() to another file Lets merge all map_callrw() tests into a single file. NO_DOC=internal --- test/router-luatest/map_part_test.lua | 59 ++++++++++++++++++++ test/router-luatest/router_test.lua | 80 --------------------------- 2 files changed, 59 insertions(+), 80 deletions(-) diff --git a/test/router-luatest/map_part_test.lua b/test/router-luatest/map_part_test.lua index cdaccd28..b443f13a 100644 --- a/test/router-luatest/map_part_test.lua +++ b/test/router-luatest/map_part_test.lua @@ -417,3 +417,62 @@ g.test_map_part_callrw_raw = function(cg) _G.old_do_map = nil end) end + +g.test_map_all_callrw_raw = function(cg) + t.run_only_if(vutil.feature.netbox_return_raw) + -- + -- Successful map. 
+ -- + local res = router_do_map(cg.router, {3}, { + timeout = vtest.wait_timeout, + return_raw = true, + }) + t.assert_equals(res.val, { + [cg.rs1_uuid] = {{cg.rs1_uuid, 3}}, + [cg.rs2_uuid] = {{cg.rs2_uuid, 3}}, + }) + t.assert_equals(res.val_type, 'userdata') + t.assert(not res.err) + -- + -- Successful map, but one of the storages returns nothing. + -- + cg.replica_2_a:exec(function() + rawset(_G, 'old_do_map', _G.do_map) + _G.do_map = function() + return + end + end) + res = router_do_map(cg.router, {}, { + timeout = vtest.wait_timeout, + return_raw = true, + }) + t.assert_equals(res.val, { + [cg.rs1_uuid] = {{cg.rs1_uuid}}, + }) + -- + -- Error at map stage. + -- + cg.replica_2_a:exec(function() + _G.do_map = function() + return box.error(box.error.PROC_LUA, "map_err") + end + end) + res = router_do_map(cg.router, {}, { + timeout = vtest.wait_timeout, + return_raw = true, + }) + t.assert_equals(res.val, nil) + t.assert_covers(res.err, { + code = box.error.PROC_LUA, + type = 'ClientError', + message = 'map_err' + }, 'error object') + t.assert_equals(res.err_uuid, cg.rs2_uuid, 'error uuid') + -- + -- Cleanup. + -- + cg.replica_2_a:exec(function() + _G.do_map = _G.old_do_map + _G.old_do_map = nil + end) +end diff --git a/test/router-luatest/router_test.lua b/test/router-luatest/router_test.lua index 022f578c..cb4456fa 100644 --- a/test/router-luatest/router_test.lua +++ b/test/router-luatest/router_test.lua @@ -206,86 +206,6 @@ g.test_return_raw = function(g) end) end -g.test_map_callrw_raw = function(g) - t.run_only_if(vutil.feature.netbox_return_raw) - - local create_map_func_f = function(res1) - rawset(_G, 'do_map', function(res2) - return {res1, res2} - end) - end - g.replica_1_a:exec(create_map_func_f, {1}) - g.replica_2_a:exec(create_map_func_f, {2}) - -- - -- Successful map. - -- - local res = g.router:exec(function() - local val, err = ivshard.router.map_callrw( - 'do_map', imsgpack.object({3}), {timeout = iwait_timeout, - return_raw = true}) - local _, one_map = next(val) - return { - val = val, - map_type = type(one_map), - err = err, - } - end) - local rs1_uuid = g.replica_1_a:replicaset_uuid() - local rs2_uuid = g.replica_2_a:replicaset_uuid() - local expected = { - [rs1_uuid] = {{1, 3}}, - [rs2_uuid] = {{2, 3}}, - } - t.assert_equals(res.val, expected, 'map callrw success') - t.assert_equals(res.map_type, 'userdata', 'values are msgpacks') - t.assert(not res.err, 'no error') - -- - -- Successful map, but one of the storages returns nothing. - -- - g.replica_2_a:exec(function() - _G.do_map = function() - return - end - end) - res = g.router:exec(function() - return ivshard.router.map_callrw('do_map', {}, {timeout = iwait_timeout, - return_raw = true}) - end) - expected = { - [rs1_uuid] = {{1}}, - } - t.assert_equals(res, expected, 'map callrw without one value success') - -- - -- Error at map stage. - -- - g.replica_2_a:exec(function() - _G.do_map = function() - return box.error(box.error.PROC_LUA, "map_err") - end - end) - local err, err_uuid - res, err, err_uuid = g.router:exec(function() - return ivshard.router.map_callrw('do_map', {}, {timeout = iwait_timeout, - return_raw = true}) - end) - t.assert(res == nil, 'no result') - t.assert_covers(err, { - code = box.error.PROC_LUA, - type = 'ClientError', - message = 'map_err' - }, 'error object') - t.assert_equals(err_uuid, rs2_uuid, 'error uuid') - -- - -- Cleanup. 
- -- - g.replica_1_a:exec(function() - _G.do_map = nil - end) - g.replica_2_a:exec(function() - _G.do_map = nil - end) -end - g.test_group_buckets = function(g) local bids = vtest.storage_get_n_buckets(g.replica_1_a, 2) local val, err = g.router:exec(function(bid1, bid2) From ca39caca4421f7e6c9828ad0d4aef44ff838f58b Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Thu, 3 Oct 2024 12:28:17 +0200 Subject: [PATCH 14/17] test: +1 replicaset to map_callrw() tests When there were only 2, all cases would either cover a single replicaset or "all" of them. Lets make them 3, so that some tests actually cover a part of a cluster which is not just a single replicaset. NO_DOC=internal --- test/router-luatest/map_part_test.lua | 30 ++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/test/router-luatest/map_part_test.lua b/test/router-luatest/map_part_test.lua index b443f13a..2fd28811 100644 --- a/test/router-luatest/map_part_test.lua +++ b/test/router-luatest/map_part_test.lua @@ -22,8 +22,16 @@ local cfg_template = { replica_2_b = {}, }, }, + { + replicas = { + replica_3_a = { + master = true, + }, + replica_3_b = {}, + }, + }, }, - bucket_count = 10, + bucket_count = 30, test_user_grant_range = 'super', } local global_cfg = vtest.config_new(cfg_template) @@ -76,6 +84,7 @@ g.before_all(function(cg) t.assert_equals(err, nil) cg.rs1_uuid = cg.replica_1_a:replicaset_uuid() cg.rs2_uuid = cg.replica_2_a:replicaset_uuid() + cg.rs3_uuid = cg.replica_3_a:replicaset_uuid() end) g.after_all(function(cg) @@ -116,6 +125,23 @@ g.test_map_part_multi_rs = function(cg) }) end +g.test_map_part_all_rs = function(cg) + local bid1 = vtest.storage_first_bucket(cg.replica_1_a) + local bid2 = vtest.storage_first_bucket(cg.replica_2_a) + local bid3 = vtest.storage_first_bucket(cg.replica_3_a) + local res = router_do_map(cg.router, {123}, { + timeout = vtest.wait_timeout, + bucket_ids = {bid1, bid2, bid3}, + }) + t.assert_equals(res.err, nil) + t.assert_equals(res.err_uuid, nil) + t.assert_equals(res.val, { + [cg.rs1_uuid] = {{cg.rs1_uuid, 123}}, + [cg.rs2_uuid] = {{cg.rs2_uuid, 123}}, + [cg.rs3_uuid] = {{cg.rs3_uuid, 123}}, + }) +end + g.test_map_part_ref = function(cg) -- First move some buckets from rs1 to rs2 and then pause gc on rs1. -- As a result, the buckets will be in the SENT state on rs1 and @@ -430,6 +456,7 @@ g.test_map_all_callrw_raw = function(cg) t.assert_equals(res.val, { [cg.rs1_uuid] = {{cg.rs1_uuid, 3}}, [cg.rs2_uuid] = {{cg.rs2_uuid, 3}}, + [cg.rs3_uuid] = {{cg.rs3_uuid, 3}}, }) t.assert_equals(res.val_type, 'userdata') t.assert(not res.err) @@ -448,6 +475,7 @@ g.test_map_all_callrw_raw = function(cg) }) t.assert_equals(res.val, { [cg.rs1_uuid] = {{cg.rs1_uuid}}, + [cg.rs3_uuid] = {{cg.rs3_uuid}}, }) -- -- Error at map stage. From ead3770e6e91873380167dcc0e8517fddd603fa6 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Thu, 3 Oct 2024 13:07:52 +0200 Subject: [PATCH 15/17] storage: fix moved buckets check 'moved_buckets' function would treat as "moved" all the buckets which are not strictly ACTIVE. But that isn't optimal. Also the 'moved_buckets' func would assume that when ref creation is started, by the end of it the buckets stay unchanged. That isn't true. Thirdly, the moved buckets could contain the destination where did they move to. Returning this to the router would make the re-discovery faster. Fourthly, PINNED buckets were not considered ACTIVE. The commit fixes all these issues. 
Firstly, when a bucket is SENDING, returning an error right away isn't good. The router would just keep retrying then, without any progress. The bucket is actually here, it is not moved yet. Better let the storage try to take a ref. Then one of 2 results are possible: - It waits without useless active retries. And then SENDING fails and becomes ACTIVE. Ref is taken, all good. - It waits without useless active retries. SENDING turns into SENT. Ref is taken for the other buckets, and this one is reported as moved. Similar logic applies to RECEIVING. Secondly, after a ref is taken, the not-moved buckets could become moved. Need to re-check them before returning the ref. Luckily, the storage can use bucket_generation to avoid this double-check when nothing changed in _bucket. NO_DOC=bugfix --- test/storage-luatest/storage_1_test.lua | 292 ++++++++++++++++++++++-- vshard/router/init.lua | 14 +- vshard/storage/init.lua | 50 ++-- 3 files changed, 321 insertions(+), 35 deletions(-) diff --git a/test/storage-luatest/storage_1_test.lua b/test/storage-luatest/storage_1_test.lua index 0e8d8bd4..1056f2ac 100644 --- a/test/storage-luatest/storage_1_test.lua +++ b/test/storage-luatest/storage_1_test.lua @@ -226,7 +226,7 @@ test_group.test_bucket_skip_validate = function(g) end) end -test_group.test_ref_with_buckets = function(g) +test_group.test_ref_with_buckets_basic = function(g) g.replica_1_a:exec(function() local lref = require('vshard.storage.ref') local res, err, _ @@ -277,7 +277,11 @@ test_group.test_ref_with_buckets = function(g) ilt.assert_equals(err, nil) ilt.assert_equals(res, { rid = rid, - moved = {bucket_count + 1, bucket_count + 2, bucket_count + 3} + moved = { + {id = bucket_count + 1}, + {id = bucket_count + 2}, + {id = bucket_count + 3}, + } }) _, err = ivshard.storage._call('storage_unref', rid) ilt.assert_equals(err, nil) @@ -290,40 +294,296 @@ test_group.test_ref_with_buckets = function(g) {bucket_count + 1, bucket_count + 2} ) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {moved = {bucket_count + 1, bucket_count + 2}}) + ilt.assert_equals(res, {moved = { + {id = bucket_count + 1}, + {id = bucket_count + 2}, + }}) ilt.assert_equals(lref.count, 0) + end) +end +test_group.test_ref_with_buckets_timeout = function(g) + g.replica_1_a:exec(function() + local lref = require('vshard.storage.ref') + local rid = 42 + local bids = _G.get_n_buckets(2) + -- -- Timeout when some buckets aren't writable. Doesn't have to be the -- same buckets as for moving. + -- _G.bucket_recovery_pause() box.space._bucket:update( {bids[1]}, {{'=', 2, ivconst.BUCKET.SENDING}}) - res, err = ivshard.storage._call( + local res, err = ivshard.storage._call( 'storage_ref_with_buckets', rid, 0.01, {bids[2]}) box.space._bucket:update( {bids[1]}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) - _G.bucket_recovery_continue() t.assert_str_contains(err.message, 'Timeout exceeded') ilt.assert_equals(res, nil) ilt.assert_equals(lref.count, 0) + _G.bucket_recovery_continue() end) end -test_group.test_moved_buckets = function(g) +test_group.test_ref_with_buckets_return_last_known_dst = function(g) g.replica_1_a:exec(function() + local lref = require('vshard.storage.ref') + local rid = 42 + local bid = _G.get_first_bucket() + local luuid = require('uuid') + local id = luuid.str() + -- Make the bucket follow the correct state sequence. Another way is + -- validated and not allowed. 
+ _G.bucket_recovery_pause() + _G.bucket_gc_pause() + box.space._bucket:update( + {bid}, {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id}}) + box.space._bucket:update( + {bid}, {{'=', 2, ivconst.BUCKET.SENT}}) local res, err = ivshard.storage._call( - 'moved_buckets', - {_G.get_first_bucket()} - ) + 'storage_ref_with_buckets', rid, iwait_timeout, {bid}) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {moved = {}}) + ilt.assert_equals(res, {moved = {{ + id = bid, + dst = id, + status = ivconst.BUCKET.SENT, + }}}) + ilt.assert_equals(lref.count, 0) + -- Cleanup. + _G.bucket_gc_continue() + _G.bucket_gc_wait() + box.space._bucket:insert({bid, ivconst.BUCKET.RECEIVING}) + box.space._bucket:update({bid}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) + _G.bucket_recovery_continue() + end) +end - local bucket_count = ivshard.storage.internal.total_bucket_count - res, err = ivshard.storage._call( - 'moved_buckets', - {_G.get_first_bucket(), bucket_count + 1, bucket_count + 2} - ) +test_group.test_ref_with_buckets_move_part_while_referencing = function(g) + g.replica_1_a:exec(function() + local lref = require('vshard.storage.ref') + local _ + local rid = 42 + local bids = _G.get_n_buckets(3) + local luuid = require('uuid') + local id = luuid.str() + -- + -- Was not moved until ref, but moved while ref was waiting. + -- + _G.bucket_recovery_pause() + _G.bucket_gc_pause() + -- Block the refs for a while. + box.space._bucket:update( + {bids[3]}, {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id}}) + -- Start referencing. + local session_id + local f = ifiber.new(function() + session_id = box.session.id() + return ivshard.storage._call('storage_ref_with_buckets', rid, + iwait_timeout, {bids[1], bids[2]}) + end) + f:set_joinable(true) + -- While waiting, one of the target buckets starts moving. + box.space._bucket:update( + {bids[2]}, {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id}}) + -- Now they are moved. + box.space._bucket:update({bids[2]}, {{'=', 2, ivconst.BUCKET.SENT}}) + box.space._bucket:update({bids[3]}, {{'=', 2, ivconst.BUCKET.SENT}}) + _G.bucket_gc_continue() + _G.bucket_gc_wait() + local ok, res, err = f:join() + t.assert(ok) + t.assert_equals(err, nil) + ilt.assert_equals(res, { + moved = {{id = bids[2], dst = id}}, + rid = rid, + }) + -- Ref was done, because at least one bucket was ok. + ilt.assert_equals(lref.count, 1) + -- Cleanup. + _, err = lref.del(rid, session_id) + ilt.assert_equals(err, nil) + ilt.assert_equals(lref.count, 0) + box.space._bucket:insert({bids[2], ivconst.BUCKET.RECEIVING}) + box.space._bucket:update({bids[2]}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) + box.space._bucket:insert({bids[3], ivconst.BUCKET.RECEIVING}) + box.space._bucket:update({bids[3]}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) + _G.bucket_recovery_continue() + end) +end + +test_group.test_ref_with_buckets_move_all_while_referencing = function(g) + g.replica_1_a:exec(function() + local lref = require('vshard.storage.ref') + local rid = 42 + local bids = _G.get_n_buckets(3) + local luuid = require('uuid') + local id = luuid.str() + -- + -- Now same, but all buckets were moved. No ref should be left then. + -- + _G.bucket_recovery_pause() + _G.bucket_gc_pause() + -- Block the refs for a while. + box.space._bucket:update( + {bids[3]}, {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id}}) + -- Start referencing. 
+ local f = ifiber.new(function() + return ivshard.storage._call('storage_ref_with_buckets', rid, + iwait_timeout, {bids[1], bids[2]}) + end) + f:set_joinable(true) + -- While waiting, all the target buckets start moving. + box.space._bucket:update( + {bids[1]}, {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id}}) + box.space._bucket:update( + {bids[2]}, {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id}}) + -- Now they are moved. + box.space._bucket:update({bids[1]}, {{'=', 2, ivconst.BUCKET.SENT}}) + box.space._bucket:update({bids[2]}, {{'=', 2, ivconst.BUCKET.SENT}}) + -- And the other bucket doesn't matter. Can revert it back. + box.space._bucket:update({bids[3]}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) + _G.bucket_gc_continue() + _G.bucket_gc_wait() + local ok, res, err = f:join() + t.assert(ok) + t.assert_equals(err, nil) + ilt.assert_equals(res, { + moved = { + {id = bids[1], dst = id}, + {id = bids[2], dst = id}, + } + }) + -- Ref was not done, because all the buckets moved out. + ilt.assert_equals(lref.count, 0) + -- Cleanup. + box.space._bucket:insert({bids[1], ivconst.BUCKET.RECEIVING}) + box.space._bucket:update({bids[1]}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) + box.space._bucket:insert({bids[2], ivconst.BUCKET.RECEIVING}) + box.space._bucket:update({bids[2]}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) + _G.bucket_recovery_continue() + end) +end + +test_group.test_moved_buckets_various_statuses = function(g) + g.replica_1_a:exec(function() + local _bucket = box.space._bucket + -- Make sure that if any bucket status is added/deleted/changed, the + -- test gets an update. + ilt.assert_equals(ivconst.BUCKET, { + ACTIVE = 'active', + PINNED = 'pinned', + SENDING = 'sending', + SENT = 'sent', + RECEIVING = 'receiving', + GARBAGE = 'garbage', + }) + _G.bucket_recovery_pause() + _G.bucket_gc_pause() + local luuid = require('uuid') + -- +1 to delete and make it a 404 bucket. + local bids = _G.get_n_buckets(7) + + -- ACTIVE = bids[1]. + -- + -- PINNED = bids[2]. + local bid_pinned = bids[2] + _bucket:update({bid_pinned}, + {{'=', 2, ivconst.BUCKET.PINNED}}) + + -- SENDING = bids[3]. + local bid_sending = bids[3] + local id_sending = luuid.str() + _bucket:update({bid_sending}, + {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id_sending}}) + + -- SENT = bids[4]. + local bid_sent = bids[4] + local id_sent = luuid.str() + _bucket:update({bid_sent}, + {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id_sent}}) + _bucket:update({bid_sent}, + {{'=', 2, ivconst.BUCKET.SENT}}) + + -- RECEIVING = bids[5]. + local bid_receiving = bids[5] + local id_receiving = luuid.str() + _bucket:update({bid_receiving}, + {{'=', 2, ivconst.BUCKET.SENDING}}) + _bucket:update({bid_receiving}, + {{'=', 2, ivconst.BUCKET.SENT}}) + _bucket:update({bid_receiving}, + {{'=', 2, ivconst.BUCKET.GARBAGE}}) + _bucket:delete({bid_receiving}) + _bucket:insert({bid_receiving, ivconst.BUCKET.RECEIVING, id_receiving}) + + -- GARBAGE = bids[6]. + local bid_garbage = bids[6] + local id_garbage = luuid.str() + _bucket:update({bid_garbage}, + {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id_garbage}}) + _bucket:update({bid_garbage}, + {{'=', 2, ivconst.BUCKET.SENT}}) + _bucket:update({bid_garbage}, + {{'=', 2, ivconst.BUCKET.GARBAGE}}) + + -- NOT EXISTING = bids[7]. 
+ local bid_404 = bids[7] + _bucket:update({bid_404}, + {{'=', 2, ivconst.BUCKET.SENDING}}) + _bucket:update({bid_404}, + {{'=', 2, ivconst.BUCKET.SENT}}) + _bucket:update({bid_404}, + {{'=', 2, ivconst.BUCKET.GARBAGE}}) + _bucket:delete({bid_404}) + + local res, err = ivshard.storage._call('moved_buckets', bids) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {moved = {bucket_count + 1, bucket_count + 2}}) + ilt.assert(res and res.moved) + ilt.assert_items_equals(res.moved, { + { + id = bid_sent, + dst = id_sent, + status = ivconst.BUCKET.SENT, + }, + { + id = bid_garbage, + dst = id_garbage, + status = ivconst.BUCKET.GARBAGE, + }, + { + id = bid_404, + } + }) + -- + -- Cleanup. + -- + -- NOT EXISTING. + _bucket:insert({bid_404, ivconst.BUCKET.RECEIVING}) + _bucket:update({bid_404}, + {{'=', 2, ivconst.BUCKET.ACTIVE}}) + -- GARBAGE. + _bucket:delete({bid_garbage}) + _bucket:insert({bid_garbage, ivconst.BUCKET.RECEIVING}) + _bucket:update({bid_garbage}, + {{'=', 2, ivconst.BUCKET.ACTIVE}}) + -- RECEIVING. + _bucket:update({bid_receiving}, + {{'=', 2, ivconst.BUCKET.ACTIVE}}) + -- SENT. + _bucket:update({bid_sent}, + {{'=', 2, ivconst.BUCKET.GARBAGE}}) + _bucket:delete({bid_sent}) + _bucket:insert({bid_sent, ivconst.BUCKET.RECEIVING}) + _bucket:update({bid_sent}, + {{'=', 2, ivconst.BUCKET.ACTIVE}}) + -- SENDING. + _bucket:update({bid_sending}, + {{'=', 2, ivconst.BUCKET.ACTIVE}}) + -- PINNED. + _bucket:update({bid_pinned}, + {{'=', 2, ivconst.BUCKET.ACTIVE}}) + + _G.bucket_recovery_continue() + _G.bucket_gc_continue() end) end diff --git a/vshard/router/init.lua b/vshard/router/init.lua index 0430b596..2986c9e9 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -947,9 +947,17 @@ local function router_ref_storage_by_buckets(router, bucket_ids, timeout) err_id = id goto fail end - for _, bucket_id in pairs(res.moved) do - bucket_reset(router, bucket_id) - table.insert(bucket_ids, bucket_id) + for _, bucket in pairs(res.moved) do + local bid = bucket.id + local dst = bucket.dst + -- 'Reset' regardless of 'set'. So as not to + -- bother with 'set' errors. If it fails, then + -- won't matter. It is a best-effort thing. + bucket_reset(router, bid) + if dst ~= nil then + bucket_set(router, bid, dst) + end + table.insert(bucket_ids, bid) end if res.rid then assert(not replicasets_to_map[id]) diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua index a8fa3a07..f49b564d 100644 --- a/vshard/storage/init.lua +++ b/vshard/storage/init.lua @@ -3181,24 +3181,40 @@ local function storage_ref(rid, timeout) end -- --- Lookup for absent active buckets. --- --- @param bucket_ids List of bucket identifiers. --- @return A dictionary with a list of bucket identifiers which are --- not present on the current instance. +-- Lookup buckets which are definitely not going to recover into ACTIVE state +-- under any circumstances. 
-- local function storage_moved_buckets(bucket_ids) - local status = consts.BUCKET + local allstatus = consts.BUCKET local moved_buckets = {} for _, bucket_id in pairs(bucket_ids) do local bucket = box.space._bucket:get{bucket_id} - if not bucket or bucket.status ~= status.ACTIVE then - table.insert(moved_buckets, bucket_id) + local is_moved + if not bucket then + is_moved = true + else + local status = bucket.status + is_moved = status == allstatus.GARBAGE or status == allstatus.SENT + end + if is_moved then + table.insert(moved_buckets, { + id = bucket_id, + dst = bucket and bucket.destination or M.route_map[bucket_id], + status = bucket and bucket.status, + }) end end return { moved = moved_buckets } end +-- +-- Drop a storage ref from the current box session. Is used as a part of +-- Map-Reduce API. +-- +local function storage_unref(rid) + return lref.del(rid, box.session.id()) +end + -- -- Bind a new storage ref to the current box session and check -- that all the buckets are present. Is used as a part of the @@ -3219,21 +3235,23 @@ local function storage_ref_with_buckets(rid, timeout, bucket_ids) -- ref. return {rid = nil, moved = moved} end + local bucket_generation = M.bucket_generation local ok, err = storage_ref(rid, timeout) if not ok then return nil, err end + if M.bucket_generation ~= bucket_generation then + -- Need to redo it. Otherwise there is a risk that some buckets were + -- moved while waiting for the ref. + moved = storage_moved_buckets(bucket_ids).moved + if #moved == #bucket_ids then + storage_unref(rid) + rid = nil + end + end return {rid = rid, moved = moved} end --- --- Drop a storage ref from the current box session. Is used as a part of --- Map-Reduce API. --- -local function storage_unref(rid) - return lref.del(rid, box.session.id()) -end - -- -- Execute a user's function under an infinite storage ref protecting from -- bucket moves. The ref should exist before, and is deleted after, regardless From f316ad1b6daf80edb64ac99c91c9175cb45d87b9 Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Wed, 9 Oct 2024 23:58:02 +0200 Subject: [PATCH 16/17] storage: fix moved buckets ref check During the partial Map-Reduce the router might visit some storages more than once. Happens when after a ref on storage-A another storage-B reports A as having taken some buckets. Then router would come back to A to confirm that. The storage still must hold its previously created ref in order for such checks to make any sense. Otherwise any of the previously confirmed buckets could have had escaped by now. Without the ref-checking the router could reach the Map stage and send some Map requests even though could detect earlier, that not all storages would succeed. This wasn't strictly speaking a bug, but it was clearly suboptimal behaviour leading to the requests being executed not on all the needed storages while the others would report errors. 
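In code, the router-side decision boils down to the following (a
simplified sketch using the call names from the router diff below; the
helper name 'pick_ref_call' is made up for illustration and is not part
of the patch):

    -- A revisit must only verify the previously taken ref. Taking a new
    -- one instead would not restore the guarantees the lost ref used to
    -- provide for the already confirmed buckets.
    local function pick_ref_call(is_already_referenced, rid, timeout, buckets)
        if is_already_referenced then
            -- The storage was refed on an earlier iteration: ask it to
            -- confirm the ref is still alive and report which of these
            -- buckets have moved since then.
            return {'storage_ref_check_with_buckets', rid, buckets}
        end
        -- First visit: take the ref and check the buckets in one call.
        return {'storage_ref_make_with_buckets', rid, timeout, buckets}
    end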
NO_DOC=internal --- test/router-luatest/map_part_test.lua | 23 ++++++++++++- test/storage-luatest/storage_1_test.lua | 38 +++++++++++----------- test/upgrade/upgrade.result | 4 +-- vshard/router/init.lua | 10 +++--- vshard/storage/init.lua | 43 ++++++++++++++++++------- 5 files changed, 80 insertions(+), 38 deletions(-) diff --git a/test/router-luatest/map_part_test.lua b/test/router-luatest/map_part_test.lua index 2fd28811..2de0291c 100644 --- a/test/router-luatest/map_part_test.lua +++ b/test/router-luatest/map_part_test.lua @@ -267,6 +267,18 @@ g.test_map_part_ref_timeout = function(cg) end end, {{bid1, bid2, bid3, bid4}}) + -- Count the map calls. The loss of ref must be detected before the + -- map-stage. + local _, err = vtest.cluster_exec_each_master(cg, function() + rawset(_G, 'old_do_map', _G.do_map) + rawset(_G, 'map_count', 0) + _G.do_map = function(...) + _G.map_count = _G.map_count + 1 + return _G.old_do_map(...) + end + end) + t.assert_equals(err, nil) + -- Send bucket so the router thinks: -- rs1: {b1, b2}, rs2: {b3, b4} -- and actually the state is: @@ -324,11 +336,20 @@ g.test_map_part_ref_timeout = function(cg) t.assert_equals(res.err_uuid, cg.rs2_uuid) -- Make sure there are no references left. - local _, err = vtest.cluster_exec_each(cg, function() + _, err = vtest.cluster_exec_each(cg, function() ilt.assert_equals(require('vshard.storage.ref').count, 0) end) t.assert_equals(err, nil) + -- No maps had a chance to get executed. + _, err = vtest.cluster_exec_each_master(cg, function() + ilt.assert_equals(_G.map_count, 0) + _G.do_map = _G.old_do_map + _G.old_do_map = nil + _G.map_count = nil + end) + t.assert_equals(err, nil) + -- Return the bucket back and re-enable discovery on the router. cg.replica_2_a:exec(function(bid, to) _G.bucket_send(bid, to) diff --git a/test/storage-luatest/storage_1_test.lua b/test/storage-luatest/storage_1_test.lua index 1056f2ac..1e1600b3 100644 --- a/test/storage-luatest/storage_1_test.lua +++ b/test/storage-luatest/storage_1_test.lua @@ -236,16 +236,16 @@ test_group.test_ref_with_buckets_basic = function(g) -- No buckets. res, err = ivshard.storage._call( - 'storage_ref_with_buckets', rid, iwait_timeout, {}) + 'storage_ref_make_with_buckets', rid, iwait_timeout, {}) ilt.assert_equals(err, nil) ilt.assert_equals(res, {moved = {}}) ilt.assert_equals(lref.count, 0) -- Check for a single ok bucket. res, err = ivshard.storage._call( - 'storage_ref_with_buckets', rid, iwait_timeout, {bids[1]}) + 'storage_ref_make_with_buckets', rid, iwait_timeout, {bids[1]}) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {rid = rid, moved = {}}) + ilt.assert_equals(res, {is_done = true, moved = {}}) ilt.assert_equals(lref.count, 1) _, err = ivshard.storage._call('storage_unref', rid) ilt.assert_equals(err, nil) @@ -253,17 +253,19 @@ test_group.test_ref_with_buckets_basic = function(g) -- Check for multiple ok buckets. res, err = ivshard.storage._call( - 'storage_ref_with_buckets', rid, iwait_timeout, {bids[1], bids[2]}) + 'storage_ref_make_with_buckets', rid, iwait_timeout, + {bids[1], bids[2]}) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {rid = rid, moved = {}}) + ilt.assert_equals(res, {is_done = true, moved = {}}) _, err = ivshard.storage._call('storage_unref', rid) ilt.assert_equals(err, nil) -- Check for double referencing. 
res, err = ivshard.storage._call( - 'storage_ref_with_buckets', rid, iwait_timeout, {bids[1], bids[1]}) + 'storage_ref_make_with_buckets', rid, iwait_timeout, + {bids[1], bids[1]}) ilt.assert_equals(err, nil) - ilt.assert_equals(res, {rid = rid, moved = {}}) + ilt.assert_equals(res, {is_done = true, moved = {}}) ilt.assert_equals(lref.count, 1) _, err = ivshard.storage._call('storage_unref', rid) ilt.assert_equals(err, nil) @@ -271,12 +273,12 @@ test_group.test_ref_with_buckets_basic = function(g) -- Bucket mix. res, err = ivshard.storage._call( - 'storage_ref_with_buckets', rid, iwait_timeout, + 'storage_ref_make_with_buckets', rid, iwait_timeout, {bucket_count + 1, bids[1], bucket_count + 2, bids[2], bucket_count + 3}) ilt.assert_equals(err, nil) ilt.assert_equals(res, { - rid = rid, + is_done = true, moved = { {id = bucket_count + 1}, {id = bucket_count + 2}, @@ -288,7 +290,7 @@ test_group.test_ref_with_buckets_basic = function(g) -- No ref when all buckets are missing. res, err = ivshard.storage._call( - 'storage_ref_with_buckets', + 'storage_ref_make_with_buckets', rid, iwait_timeout, {bucket_count + 1, bucket_count + 2} @@ -315,7 +317,7 @@ test_group.test_ref_with_buckets_timeout = function(g) box.space._bucket:update( {bids[1]}, {{'=', 2, ivconst.BUCKET.SENDING}}) local res, err = ivshard.storage._call( - 'storage_ref_with_buckets', rid, 0.01, {bids[2]}) + 'storage_ref_make_with_buckets', rid, 0.01, {bids[2]}) box.space._bucket:update( {bids[1]}, {{'=', 2, ivconst.BUCKET.ACTIVE}}) t.assert_str_contains(err.message, 'Timeout exceeded') @@ -341,7 +343,7 @@ test_group.test_ref_with_buckets_return_last_known_dst = function(g) box.space._bucket:update( {bid}, {{'=', 2, ivconst.BUCKET.SENT}}) local res, err = ivshard.storage._call( - 'storage_ref_with_buckets', rid, iwait_timeout, {bid}) + 'storage_ref_make_with_buckets', rid, iwait_timeout, {bid}) ilt.assert_equals(err, nil) ilt.assert_equals(res, {moved = {{ id = bid, @@ -378,7 +380,7 @@ test_group.test_ref_with_buckets_move_part_while_referencing = function(g) local session_id local f = ifiber.new(function() session_id = box.session.id() - return ivshard.storage._call('storage_ref_with_buckets', rid, + return ivshard.storage._call('storage_ref_make_with_buckets', rid, iwait_timeout, {bids[1], bids[2]}) end) f:set_joinable(true) @@ -395,7 +397,7 @@ test_group.test_ref_with_buckets_move_part_while_referencing = function(g) t.assert_equals(err, nil) ilt.assert_equals(res, { moved = {{id = bids[2], dst = id}}, - rid = rid, + is_done = true, }) -- Ref was done, because at least one bucket was ok. ilt.assert_equals(lref.count, 1) @@ -428,7 +430,7 @@ test_group.test_ref_with_buckets_move_all_while_referencing = function(g) {bids[3]}, {{'=', 2, ivconst.BUCKET.SENDING}, {'=', 3, id}}) -- Start referencing. 
local f = ifiber.new(function() - return ivshard.storage._call('storage_ref_with_buckets', rid, + return ivshard.storage._call('storage_ref_make_with_buckets', rid, iwait_timeout, {bids[1], bids[2]}) end) f:set_joinable(true) @@ -536,10 +538,8 @@ test_group.test_moved_buckets_various_statuses = function(g) {{'=', 2, ivconst.BUCKET.GARBAGE}}) _bucket:delete({bid_404}) - local res, err = ivshard.storage._call('moved_buckets', bids) - ilt.assert_equals(err, nil) - ilt.assert(res and res.moved) - ilt.assert_items_equals(res.moved, { + local moved = ivshard.storage.internal.bucket_get_moved(bids) + ilt.assert_items_equals(moved, { { id = bid_sent, dst = id_sent, diff --git a/test/upgrade/upgrade.result b/test/upgrade/upgrade.result index 5e8e71f5..e0c9469b 100644 --- a/test/upgrade/upgrade.result +++ b/test/upgrade/upgrade.result @@ -174,13 +174,13 @@ vshard.storage._call('test_api', 1, 2, 3) | - - bucket_recv | - bucket_test_gc | - info - | - moved_buckets | - rebalancer_apply_routes | - rebalancer_request_state | - recovery_bucket_stat | - storage_map | - storage_ref - | - storage_ref_with_buckets + | - storage_ref_check_with_buckets + | - storage_ref_make_with_buckets | - storage_unref | - test_api | - 1 diff --git a/vshard/router/init.lua b/vshard/router/init.lua index 2986c9e9..6c2dc14a 100644 --- a/vshard/router/init.lua +++ b/vshard/router/init.lua @@ -919,9 +919,11 @@ local function router_ref_storage_by_buckets(router, bucket_ids, timeout) if replicasets_to_map[id] then -- Replicaset is already referenced on a previous iteration. -- Simply get the moved buckets without double referencing. - args_ref = {'moved_buckets', buckets} + args_ref = { + 'storage_ref_check_with_buckets', rid, buckets} else - args_ref = {'storage_ref_with_buckets', rid, timeout, buckets} + args_ref = { + 'storage_ref_make_with_buckets', rid, timeout, buckets} end res, err = replicasets_all[id]:callrw('vshard.storage._call', args_ref, opts_async) @@ -941,7 +943,7 @@ local function router_ref_storage_by_buckets(router, bucket_ids, timeout) err_id = id goto fail end - -- Ref returns nil,err or {rid, moved}. + -- Ref returns nil,err or {is_done, moved}. res, err = res[1], res[2] if res == nil then err_id = id @@ -959,7 +961,7 @@ local function router_ref_storage_by_buckets(router, bucket_ids, timeout) end table.insert(bucket_ids, bid) end - if res.rid then + if res.is_done then assert(not replicasets_to_map[id]) -- If there are no buckets on the replicaset, it would not be -- referenced. diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua index f49b564d..1c174663 100644 --- a/vshard/storage/init.lua +++ b/vshard/storage/init.lua @@ -3184,9 +3184,9 @@ end -- Lookup buckets which are definitely not going to recover into ACTIVE state -- under any circumstances. 
-- -local function storage_moved_buckets(bucket_ids) +local function bucket_get_moved(bucket_ids) local allstatus = consts.BUCKET - local moved_buckets = {} + local res = {} for _, bucket_id in pairs(bucket_ids) do local bucket = box.space._bucket:get{bucket_id} local is_moved @@ -3197,14 +3197,14 @@ local function storage_moved_buckets(bucket_ids) is_moved = status == allstatus.GARBAGE or status == allstatus.SENT end if is_moved then - table.insert(moved_buckets, { + table.insert(res, { id = bucket_id, dst = bucket and bucket.destination or M.route_map[bucket_id], status = bucket and bucket.status, }) end end - return { moved = moved_buckets } + return res end -- @@ -3228,12 +3228,12 @@ end -- are absent, the reference is not created and a nil reference id -- with the list of absent buckets is returned. -- -local function storage_ref_with_buckets(rid, timeout, bucket_ids) - local moved = storage_moved_buckets(bucket_ids).moved +local function storage_ref_make_with_buckets(rid, timeout, bucket_ids) + local moved = bucket_get_moved(bucket_ids) if #moved == #bucket_ids then -- If all the passed buckets are absent, there is no need to create a -- ref. - return {rid = nil, moved = moved} + return {moved = moved} end local bucket_generation = M.bucket_generation local ok, err = storage_ref(rid, timeout) @@ -3243,13 +3243,31 @@ local function storage_ref_with_buckets(rid, timeout, bucket_ids) if M.bucket_generation ~= bucket_generation then -- Need to redo it. Otherwise there is a risk that some buckets were -- moved while waiting for the ref. - moved = storage_moved_buckets(bucket_ids).moved + moved = bucket_get_moved(bucket_ids) if #moved == #bucket_ids then storage_unref(rid) - rid = nil + return {moved = moved} end end - return {rid = rid, moved = moved} + return {is_done = true, moved = moved} +end + +-- +-- Check which buckets from the given list are moved out of this storage, while +-- also making sure that the storage-ref is still in place. +-- +-- Partial Map-Reduce makes a ref on the storages having any of the needed +-- buckets, but then can come back if other storages report the already visited +-- ones as having the needed buckets. Only makes sense to check, if this storage +-- still holds the ref. Otherwise its previous guarantees given during the ref +-- creation are all gone. 
+-- +local function storage_ref_check_with_buckets(rid, bucket_ids) + local ok, err = lref.check(rid, box.session.id()) + if not ok then + return nil, err + end + return {moved = bucket_get_moved(bucket_ids)} end -- @@ -3298,9 +3316,9 @@ service_call_api = setmetatable({ rebalancer_apply_routes = rebalancer_apply_routes, rebalancer_request_state = rebalancer_request_state, recovery_bucket_stat = recovery_bucket_stat, - moved_buckets = storage_moved_buckets, storage_ref = storage_ref, - storage_ref_with_buckets = storage_ref_with_buckets, + storage_ref_make_with_buckets = storage_ref_make_with_buckets, + storage_ref_check_with_buckets = storage_ref_check_with_buckets, storage_unref = storage_unref, storage_map = storage_map, info = storage_service_info, @@ -4182,6 +4200,7 @@ M.bucket_state_edges = bucket_state_edges M.bucket_are_all_rw = bucket_are_all_rw_public M.bucket_generation_wait = bucket_generation_wait +M.bucket_get_moved = bucket_get_moved lregistry.storage = M -- From 08aa3804b28276bdfb2611971fa969af89123c2b Mon Sep 17 00:00:00 2001 From: Vladislav Shpilevoy Date: Thu, 3 Oct 2024 13:08:35 +0200 Subject: [PATCH 17/17] test: rename map_part_test to map_callrw_test It tests not only partial Map-Reduce. It covers a bit of the full one as well. NO_DOC=internal --- test/router-luatest/{map_part_test.lua => map_callrw_test.lua} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/router-luatest/{map_part_test.lua => map_callrw_test.lua} (100%) diff --git a/test/router-luatest/map_part_test.lua b/test/router-luatest/map_callrw_test.lua similarity index 100% rename from test/router-luatest/map_part_test.lua rename to test/router-luatest/map_callrw_test.lua
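For reference, below is a hedged usage sketch of the bucket_ids option
added to router.map_callrw() by this series. The storage function name
'my_map_func' and the timeout value are purely illustrative; the option
names and return values follow the updated map_callrw() documentation.

    local vshard = require('vshard')

    -- 'my_map_func' is assumed to be a global function defined on every
    -- storage master. It is executed exactly once on each replicaset
    -- that holds at least one of the given buckets.
    local function call_on_buckets(bucket_ids, args)
        local res, err, err_uuid = vshard.router.map_callrw(
            'my_map_func', args, {
                timeout = 10,
                -- Only the replicasets having at least one of these
                -- buckets are refed and mapped. Without the option the
                -- whole cluster is covered, as before.
                bucket_ids = bucket_ids,
            })
        if res == nil then
            -- err_uuid may be nil when the error is not about a concrete
            -- replicaset, e.g. not all buckets were found.
            return nil, err, err_uuid
        end
        -- res maps replicaset ID (UUID or name) to a table with the
        -- values returned by 'my_map_func' on that replicaset.
        return res
    end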