Skip to content

Commit

Permalink
Fix issue when init nodes are used
Browse files Browse the repository at this point in the history
Includes a new testcase simulating that all nodes are unreachable.

Signed-off-by: Björn Svensson <[email protected]>
  • Loading branch information
bjosv committed Oct 30, 2024
1 parent 49983bf commit ef2556e
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 4 deletions.
8 changes: 4 additions & 4 deletions src/ered_cluster.erl
Original file line number Diff line number Diff line change
Expand Up @@ -664,8 +664,8 @@ start_clients(Addrs, State) ->
{State#st.nodes, State#st.closing},
Addrs),

State#st{nodes = maps:merge(State#st.nodes, NewNodes),
pending = sets:union(State#st.pending,
sets:subtract(new_set(maps:keys(NewNodes)),
State#st.up)),
NewPending = sets:union(State#st.pending, sets:subtract(new_set(maps:keys(NewNodes)),
new_set(maps:keys(State#st.nodes)))),
State#st{nodes = NewNodes,
pending = NewPending,
closing = NewClosing}.
58 changes: 58 additions & 0 deletions test/ered_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ all() ->
t_manual_failover,
t_manual_failover_then_old_master_down,
t_blackhole,
t_blackhole_all_nodes,
t_init_timeout,
t_empty_slotmap,
t_empty_initial_slotmap,
Expand Down Expand Up @@ -467,6 +468,63 @@ t_blackhole(_) ->

no_more_msgs().

%% Test case: pauses every cluster node's container so that all nodes become
%% unresponsive at once, checks the messages reported while the nodes are
%% unreachable, then unpauses the containers and checks that the cluster is
%% reported as ok again.
t_blackhole_all_nodes(_) ->
    %% Simulate that all nodes are unreachable, e.g. a network failure. We use
    %% 'docker pause', similar to sending SIGSTOP to a process, to make the
    %% nodes unresponsive. This makes TCP recv() and connect() time out.
    CloseWait = 2000, % default is 10000
    NodeDownTimeout = 2000, % default is 2000
    ResponseTimeout = 10000, % default is 10000
    R = start_cluster([{close_wait, CloseWait},
                       %% Require replicas for 'cluster OK'.
                       {min_replicas, 1},
                       {client_opts,
                        [{node_down_timeout, NodeDownTimeout},
                         {connection_opts,
                          [{response_timeout, ResponseTimeout}]}]}
                      ]),

    %% Pause all nodes
    lists:foreach(fun(Port) ->
                          Pod = get_pod_name_from_port(Port),
                          %% The command output is passed as a format argument, not
                          %% concatenated into the format string, so a '~' in the
                          %% output cannot be misread as a control sequence.
                          ct:pal("Pausing container: ~ts", [os:cmd("docker pause " ++ Pod)])
                  end, ?PORTS),

    %% Send PING to all nodes and expect closed sockets, error replies for sent requests,
    %% and a report that the cluster is not ok.
    TestPid = self(),
    AddrToPid = ered:get_addr_to_client_map(R),
    maps:foreach(fun(_ClientAddr, ClientPid) ->
                         %% The reply callback forwards the result to the test process
                         %% so it can be matched with ?MSG below.
                         ered:command_client_async(ClientPid, [<<"PING">>],
                                                   fun(Reply) -> TestPid ! {ping_reply, Reply} end)
                 end, AddrToPid),

    %% The socket is expected to be closed for every port once the response timeout fires.
    [?MSG(#{msg_type := socket_closed, reason := {recv_exit, timeout}, addr := {"127.0.0.1", Port}},
          ResponseTimeout + 1000) || Port <- ?PORTS],
    %% Six error replies are expected; presumably one per client in AddrToPid
    %% (TODO confirm that the cluster has six nodes).
    ?MSG({ping_reply, {error, _Reason1}}, NodeDownTimeout + 1000),
    ?MSG({ping_reply, {error, _Reason2}}, NodeDownTimeout + 1000),
    ?MSG({ping_reply, {error, _Reason3}}, NodeDownTimeout + 1000),
    ?MSG({ping_reply, {error, _Reason4}}, NodeDownTimeout + 1000),
    ?MSG({ping_reply, {error, _Reason5}}, NodeDownTimeout + 1000),
    ?MSG({ping_reply, {error, _Reason6}}, NodeDownTimeout + 1000),
    [?MSG(#{msg_type := node_down_timeout, addr := {"127.0.0.1", Port}}) || Port <- ?PORTS],
    ?MSG(#{msg_type := cluster_not_ok, reason := master_down}),

    %% Unpause all nodes
    lists:foreach(fun(Port) ->
                          Pod = get_pod_name_from_port(Port),
                          ct:pal("Unpausing container: ~ts", [os:cmd("docker unpause " ++ Pod)])
                  end, ?PORTS),
    timer:sleep(500),

    wait_for_consistent_cluster(),

    %% Expect connects and a cluster ok.
    [?MSG(#{msg_type := connected, addr := {"127.0.0.1", Port}}, 10000) || Port <- ?PORTS],
    ?MSG(#{msg_type := cluster_ok}, 10000),

    no_more_msgs().


t_init_timeout(_) ->
Opts = [
Expand Down

0 comments on commit ef2556e

Please sign in to comment.