Skip to content

Commit

Permalink
ra_checkpoint_SUITE: Add a case testing corrupt checkpoint promotion
Browse files Browse the repository at this point in the history
  • Loading branch information
the-mikedavis committed Jul 25, 2024
1 parent 2bc7c16 commit 2cb471c
Showing 1 changed file with 48 additions and 1 deletion.
49 changes: 48 additions & 1 deletion test/ra_checkpoint_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ all_tests() ->
recover_from_checkpoint_and_snapshot,
newer_snapshot_deletes_older_checkpoints,
init_recover_corrupt,
init_recover_multi_corrupt
init_recover_multi_corrupt,
promote_corrupt
].

groups() ->
Expand Down Expand Up @@ -341,6 +342,52 @@ init_recover_multi_corrupt(Config) ->

ok.

promote_corrupt(Config) ->
    %% Eager validation only covers the most recent valid checkpoint; any
    %% other checkpoint is validated lazily, at the moment it is promoted.
    %% This case truncates a checkpoint on disk, promotes it, and expects
    %% the resulting validation failure to be handled gracefully: the
    %% promoter exits with a validation error and neither the corrupt
    %% checkpoint directory nor a promoted snapshot directory remains.
    Idx = 55,
    Term = 2,
    IdxTerm = {Idx, Term},
    InitialState = init_state(Config),

    %% Write a checkpoint at {Idx, Term} and wait for the completion event.
    {PendingState, _} =
        ra_snapshot:begin_snapshot(meta(Idx, Term, [node()]),
                                   ?FUNCTION_NAME, checkpoint, InitialState),
    CheckpointedState =
        receive
            {ra_log_event, {snapshot_written, IdxTerm, checkpoint}} ->
                ra_snapshot:complete_snapshot(IdxTerm, checkpoint,
                                              PendingState)
        after 1000 ->
            error(snapshot_event_timeout)
        end,

    %% Corrupt the checkpoint: opening snapshot.dat in 'write' mode without
    %% 'read' truncates it, so an immediate close leaves an empty file.
    DirName = ra_lib:zpad_hex(Term) ++ "_" ++ ra_lib:zpad_hex(Idx),
    CheckpointPath = filename:join(?config(checkpoint_dir, Config), DirName),
    DataFile = filename:join(CheckpointPath, "snapshot.dat"),
    {ok, Handle} = file:open(DataFile, [binary, write, raw]),
    ok = file:close(Handle),

    %% Promotion is accepted and hands back a monitor effect naming the
    %% process that performs it.
    {true, PromotingState,
     [{monitor, process, snapshot_writer, PromoterPid} | _]} =
        ra_snapshot:promote_checkpoint(Idx, CheckpointedState),
    %% NOTE(review): this monitor is created after promote_checkpoint/2 has
    %% returned. If the promoter has already exited by now, the 'DOWN'
    %% reason would be noproc rather than the validation error -- confirm
    %% this cannot race in practice.
    MonitorRef = erlang:monitor(process, PromoterPid),

    ExitReason =
        receive
            {'DOWN', MonitorRef, process, PromoterPid, Why} ->
                Why
        after 1000 ->
            error(snapshot_promotion_down_timeout)
        end,
    ?assertEqual({validation_failed, {error, invalid_format}}, ExitReason),
    %% Feed the exit back into the snapshot state, as the owner would.
    _ = ra_snapshot:handle_down(PromoterPid, ExitReason, PromotingState),

    %% Neither the corrupt checkpoint nor a promoted snapshot may remain.
    ?assertNot(filelib:is_dir(CheckpointPath)),
    SnapshotPath = filename:join(?config(snapshot_dir, Config), DirName),
    ?assertNot(filelib:is_dir(SnapshotPath)),

    ok.

%%%===================================================================
%%% Helper functions
%%%===================================================================
Expand Down

0 comments on commit 2cb471c

Please sign in to comment.